From b65a42bc2601f3ddad272ee2c610e84bd3c941e9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:44:49 +0100 Subject: [PATCH 01/43] creating the test for the issue --- .../unit/test_modules_auto_scaling_dynamic.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 461baee21fa..326ae652c3d 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1187,6 +1187,92 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +@pytest.mark.parametrize( + "scale_up_params1, scale_up_params2", + [ + pytest.param( + _ScaleUpParams( + imposed_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB + service_resources=Resources( + cpus=5, ram=TypeAdapter(ByteSize).validate_python("30Gib") + ), + num_services=10, + expected_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB + expected_num_instances=4, + ), + _ScaleUpParams( + imposed_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB + service_resources=Resources( + cpus=5, ram=TypeAdapter(ByteSize).validate_python("20480MB") + ), + num_services=7, + expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB + expected_num_instances=2, + ), + id="Two different instance types are needed", + ), + ], +) +async def test_cluster_adapts_machines_on_the_fly( + minimal_configuration: None, + ec2_client: EC2Client, + initialized_app: FastAPI, + create_service: Callable[ + [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + ], + task_template: dict[str, Any], + create_task_reservations: Callable[[int, int], dict[str, Any]], + service_monitored_labels: dict[DockerLabelKey, str], + osparc_docker_label_keys: StandardSimcoreDockerLabels, + ec2_instance_custom_tags: dict[str, str], + instance_type_filters: Sequence[FilterTypeDef], + scale_up_params1: _ScaleUpParams, + scale_up_params2: _ScaleUpParams, +): + # we have nothing running now + all_instances = await ec2_client.describe_instances() + assert not all_instances["Reservations"] + + # create several tasks that needs more power + await asyncio.gather( + *( + create_service( + task_template + | create_task_reservations( + int(scale_up_params1.service_resources.cpus), + scale_up_params1.service_resources.ram, + ), + service_monitored_labels + | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), + "pending", + ( + [ + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params1.imposed_instance_type}" + ] + if scale_up_params1.imposed_instance_type + else [] + ), + ) + for _ in range(scale_up_params1.num_services) + ) + ) + + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + + # check the instances were started + await assert_autoscaled_dynamic_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + instance_filters=instance_type_filters, + ) + + @pytest.mark.parametrize( "docker_service_imposed_ec2_type, docker_service_ram, expected_ec2_type", [ From 24d600eb9190f0bed3c9f3c95fb41daf001fccdf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:03:46 +0100 Subject: [PATCH 02/43] need to patch when using moto --- .../autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 326ae652c3d..a8fd7873b7b 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1214,6 +1214,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( ], ) async def test_cluster_adapts_machines_on_the_fly( + patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, ec2_client: EC2Client, initialized_app: FastAPI, From 8ce92dac42554b323e2f5aa98bc43b0146f20e91 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:17:15 +0100 Subject: [PATCH 03/43] initial test --- .../unit/test_modules_auto_scaling_dynamic.py | 87 ++++++++++++++----- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index a8fd7873b7b..3246635b2d4 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1194,22 +1194,22 @@ async def test_cluster_scaling_up_starts_multiple_instances( _ScaleUpParams( imposed_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB service_resources=Resources( - cpus=5, ram=TypeAdapter(ByteSize).validate_python("30Gib") + cpus=16, ram=TypeAdapter(ByteSize).validate_python("30Gib") ), - num_services=10, + num_services=12, expected_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB - expected_num_instances=4, + expected_num_instances=10, ), _ScaleUpParams( imposed_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB service_resources=Resources( - cpus=5, ram=TypeAdapter(ByteSize).validate_python("20480MB") + cpus=32, ram=TypeAdapter(ByteSize).validate_python("20480MB") ), num_services=7, expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB - expected_num_instances=2, + expected_num_instances=7, ), - id="Two different instance types are needed", + id="A batch of services requiring g3.4xlarge and a batch requiring g4dn.8xlarge", ), ], ) @@ -1218,6 +1218,7 @@ async def test_cluster_adapts_machines_on_the_fly( minimal_configuration: None, ec2_client: EC2Client, initialized_app: FastAPI, + app_settings: ApplicationSettings, create_service: Callable[ [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] ], @@ -1230,11 +1231,19 @@ async def test_cluster_adapts_machines_on_the_fly( scale_up_params1: _ScaleUpParams, scale_up_params2: _ScaleUpParams, ): + # pre-requisites + assert app_settings.AUTOSCALING_EC2_INSTANCES + assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES > 0 + assert ( + scale_up_params1.num_services + >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ), "this test requires to run a first batch of more services than the maximum number of instances allowed" # we have nothing running now all_instances = await ec2_client.describe_instances() assert not all_instances["Reservations"] - # create several tasks that needs more power + # + # 1. create the first batch of services requiring the initial machines await asyncio.gather( *( create_service( @@ -1257,21 +1266,59 @@ async def test_cluster_adapts_machines_on_the_fly( for _ in range(scale_up_params1.num_services) ) ) + for _ in range(3): + # it will only scale once and do nothing else + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + await assert_autoscaled_dynamic_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + instance_filters=instance_type_filters, + ) - await auto_scale_cluster( - app=initialized_app, auto_scaling_mode=DynamicAutoscaling() - ) - - # check the instances were started - await assert_autoscaled_dynamic_ec2_instances( - ec2_client, - expected_num_reservations=1, - expected_num_instances=scale_up_params1.expected_num_instances, - expected_instance_type=scale_up_params1.expected_instance_type, - expected_instance_state="running", - expected_additional_tag_keys=list(ec2_instance_custom_tags), - instance_filters=instance_type_filters, + # + # 2. now we start the second batch of services requiring a different type of machines + await asyncio.gather( + *( + create_service( + task_template + | create_task_reservations( + int(scale_up_params2.service_resources.cpus), + scale_up_params2.service_resources.ram, + ), + service_monitored_labels + | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), + "pending", + ( + [ + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params2.imposed_instance_type}" + ] + if scale_up_params2.imposed_instance_type + else [] + ), + ) + for _ in range(scale_up_params2.num_services) + ) ) + for _ in range(3): + # scaling will do nothing since we have hit the maximum number of machines + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + await assert_autoscaled_dynamic_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + instance_filters=instance_type_filters, + ) @pytest.mark.parametrize( From 42c89a02649d3b67e27904b2d46729c94a7358f4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:25:42 +0100 Subject: [PATCH 04/43] parametrize --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 3246635b2d4..0ee7d7861b6 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1187,6 +1187,14 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +@pytest.mark.parametrize( + "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True +) +@pytest.mark.parametrize( + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params1, scale_up_params2", [ From 9df58c2e49bd61f563f6ea9257bd8abf5babc093 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:43:03 +0100 Subject: [PATCH 05/43] test ready for dynamic --- .../unit/test_modules_auto_scaling_dynamic.py | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 0ee7d7861b6..6b989333295 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -9,6 +9,7 @@ import asyncio import datetime import logging +import random from collections.abc import AsyncIterator, Awaitable, Callable, Iterator, Sequence from copy import deepcopy from dataclasses import dataclass @@ -1236,6 +1237,7 @@ async def test_cluster_adapts_machines_on_the_fly( osparc_docker_label_keys: StandardSimcoreDockerLabels, ec2_instance_custom_tags: dict[str, str], instance_type_filters: Sequence[FilterTypeDef], + async_docker_client: aiodocker.Docker, scale_up_params1: _ScaleUpParams, scale_up_params2: _ScaleUpParams, ): @@ -1252,7 +1254,7 @@ async def test_cluster_adapts_machines_on_the_fly( # # 1. create the first batch of services requiring the initial machines - await asyncio.gather( + first_batch_services = await asyncio.gather( *( create_service( task_template @@ -1313,8 +1315,8 @@ async def test_cluster_adapts_machines_on_the_fly( for _ in range(scale_up_params2.num_services) ) ) + # scaling will do nothing since we have hit the maximum number of machines for _ in range(3): - # scaling will do nothing since we have hit the maximum number of machines await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) @@ -1328,6 +1330,46 @@ async def test_cluster_adapts_machines_on_the_fly( instance_filters=instance_type_filters, ) + # now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max + # a machine should switch off and another type should be started + completed_services_to_stop = random.sample( + first_batch_services, + scale_up_params1.num_services + - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + + 1, + ) + await asyncio.gather( + *( + async_docker_client.services.delete(s.id) + for s in completed_services_to_stop + if s.id + ) + ) + + for _ in range(3): + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + all_instances = await ec2_client.describe_instances() + assert len(all_instances["Reservations"]) == 2, "there should be 2 Reservations" + reservation1 = all_instances["Reservations"][0] + assert "Instances" in reservation1 + assert len(reservation1["Instances"]) == ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - 1 + ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES-1} EC2 instances, found {len(reservation1['Instances'])}" + for instance in reservation1["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params1.expected_instance_type + + reservation2 = all_instances["Reservations"][0] + assert "Instances" in reservation2 + assert len(reservation2["Instances"]) == ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - 1 + ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES-1} EC2 instances, found {len(reservation2['Instances'])}" + for instance in reservation2["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params2.expected_instance_type + @pytest.mark.parametrize( "docker_service_imposed_ec2_type, docker_service_ram, expected_ec2_type", From f3971418f917941d8b32b87d9be1f1ee9e6edf55 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:43:36 +0100 Subject: [PATCH 06/43] ongoing test --- .../modules/auto_scaling_core.py | 59 +++++----- .../unit/test_modules_auto_scaling_dynamic.py | 108 +++++++++++++++--- 2 files changed, 125 insertions(+), 42 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 8d5ff16dd9a..909a30c9de5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -1116,16 +1116,26 @@ async def _drain_retired_nodes( ) +async def _scale_down_unused_cluster_machines( + app: FastAPI, + cluster: Cluster, + auto_scaling_mode: BaseAutoscaling, +) -> Cluster: + await auto_scaling_mode.try_retire_nodes(app) + cluster = await _deactivate_empty_nodes(app, cluster) + return await _try_scale_down_cluster(app, cluster) + + async def _autoscale_cluster( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling, allowed_instance_types: list[EC2InstanceType], ) -> Cluster: - # 1. check if we have pending tasks and resolve them by activating some drained nodes + # 1. check if we have pending tasks unrunnable_tasks = await auto_scaling_mode.list_unrunnable_tasks(app) _logger.info("found %s unrunnable tasks", len(unrunnable_tasks)) - # NOTE: this function predicts how dask will assign a task to a machine + # NOTE: this function predicts how the backend will assign tasks queued_or_missing_instance_tasks, cluster = await _assign_tasks_to_current_cluster( app, unrunnable_tasks, cluster, auto_scaling_mode ) @@ -1135,41 +1145,34 @@ async def _autoscale_cluster( # 3. start buffer instances to cover the remaining tasks cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode) + # 4. scale down unused machines + cluster = await _scale_down_unused_cluster_machines(app, cluster, auto_scaling_mode) + # 4. let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - if queued_or_missing_instance_tasks or ( - len(cluster.buffer_drained_nodes) - < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ): - if ( + if ( + queued_or_missing_instance_tasks + or ( + len(cluster.buffer_drained_nodes) + < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) + and ( cluster.total_number_of_machines() < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - ): - _logger.info( - "%s unrunnable tasks could not be assigned, slowly trying to scale up...", - len(queued_or_missing_instance_tasks), - ) - cluster = await _scale_up_cluster( - app, - cluster, - queued_or_missing_instance_tasks, - auto_scaling_mode, - allowed_instance_types, - ) - - elif ( - len(queued_or_missing_instance_tasks) == len(unrunnable_tasks) == 0 - and cluster.can_scale_down() + ) ): _logger.info( - "there is %s waiting task, slowly and gracefully scaling down...", + "%s unrunnable tasks could not be assigned, slowly trying to scale up...", len(queued_or_missing_instance_tasks), ) - # NOTE: we only scale down in case we did not just scale up. The swarm needs some time to adjust - await auto_scaling_mode.try_retire_nodes(app) - cluster = await _deactivate_empty_nodes(app, cluster) - cluster = await _try_scale_down_cluster(app, cluster) + cluster = await _scale_up_cluster( + app, + cluster, + queued_or_missing_instance_tasks, + auto_scaling_mode, + allowed_instance_types, + ) return cluster diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 6b989333295..7aedbf19fd3 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1188,6 +1188,20 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +@pytest.fixture +async def mocked_associate_ec2_instances_with_nodes(mocker: MockerFixture) -> mock.Mock: + async def _( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + return [], ec2_instances + + return mocker.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.associate_ec2_instances_with_nodes", + autospec=True, + side_effect=_, + ) + + @pytest.mark.parametrize( "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True ) @@ -1240,6 +1254,11 @@ async def test_cluster_adapts_machines_on_the_fly( async_docker_client: aiodocker.Docker, scale_up_params1: _ScaleUpParams, scale_up_params2: _ScaleUpParams, + mocked_associate_ec2_instances_with_nodes: mock.Mock, + create_fake_node: Callable[..., Node], + mock_docker_tag_node: mock.Mock, + mock_compute_node_used_resources: mock.Mock, + spied_cluster_analysis: MockType, ): # pre-requisites assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -1276,20 +1295,73 @@ async def test_cluster_adapts_machines_on_the_fly( for _ in range(scale_up_params1.num_services) ) ) - for _ in range(3): - # it will only scale once and do nothing else - await auto_scale_cluster( - app=initialized_app, auto_scaling_mode=DynamicAutoscaling() - ) - await assert_autoscaled_dynamic_ec2_instances( - ec2_client, - expected_num_reservations=1, - expected_num_instances=scale_up_params1.expected_num_instances, - expected_instance_type=scale_up_params1.expected_instance_type, - expected_instance_state="running", - expected_additional_tag_keys=list(ec2_instance_custom_tags), - instance_filters=instance_type_filters, - ) + + # it will only scale once and do nothing else + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + await assert_autoscaled_dynamic_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + instance_filters=instance_type_filters, + ) + _assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=0, + ) + mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) + mocked_associate_ec2_instances_with_nodes.reset_mock() + + fake_node_to_instance_map = {} + + async def _fake_node_creator( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: + if instance not in fake_node_to_instance_map: + fake_node = create_fake_node() + assert fake_node.spec + fake_node.spec.availability = Availability.active + assert fake_node.status + fake_node.status.state = NodeState.ready + assert fake_node.spec.labels + fake_node.spec.labels |= { + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), + _OSPARC_SERVICE_READY_LABEL_KEY: "true", + } + fake_node_to_instance_map[instance] = fake_node + return fake_node_to_instance_map[instance] + + associated_instances = [ + AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) + for i in ec2_instances + ] + + return associated_instances, [] + + mocked_associate_ec2_instances_with_nodes.side_effect = _fake_node_creator + + # + # 2. now the machines are associated + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + _assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + mocked_associate_ec2_instances_with_nodes.assert_called_once() + mock_docker_tag_node.assert_called() + assert ( + mock_docker_tag_node.call_count + == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ) # # 2. now we start the second batch of services requiring a different type of machines @@ -1330,6 +1402,10 @@ async def test_cluster_adapts_machines_on_the_fly( instance_filters=instance_type_filters, ) + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert spied_cluster_analysis.spy_return.active_nodes + assert not spied_cluster_analysis.spy_return.drained_nodes + # now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max # a machine should switch off and another type should be started completed_services_to_stop = random.sample( @@ -1350,6 +1426,10 @@ async def test_cluster_adapts_machines_on_the_fly( await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + + assert spied_cluster_analysis.spy_return.active_nodes + assert not spied_cluster_analysis.spy_return.drained_nodes + all_instances = await ec2_client.describe_instances() assert len(all_instances["Reservations"]) == 2, "there should be 2 Reservations" reservation1 = all_instances["Reservations"][0] From 9e02dc43052532b39c60dd9c5b733b9316b72e51 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:56:47 +0100 Subject: [PATCH 07/43] sort nodes --- .../utils/utils_docker.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index 4758c91a12f..65caa0f40b1 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -82,19 +82,26 @@ ).validate_python("io.simcore.osparc-node-termination-started") +def _get_node_creation_date(node: Node) -> datetime.datetime: + assert node.created_at # nosec + return arrow.get(node.created_at).datetime + + async def get_monitored_nodes( docker_client: AutoscalingDocker, node_labels: list[DockerLabelKey] ) -> list[Node]: node_label_filters = [f"{label}=true" for label in node_labels] + [ f"{label}" for label in _OSPARC_SERVICE_READY_LABEL_KEYS ] - return TypeAdapter(list[Node]).validate_python( + list_of_nodes = TypeAdapter(list[Node]).validate_python( await docker_client.nodes.list(filters={"node.label": node_label_filters}) ) + list_of_nodes.sort(key=_get_node_creation_date) + return list_of_nodes async def get_worker_nodes(docker_client: AutoscalingDocker) -> list[Node]: - return TypeAdapter(list[Node]).validate_python( + list_of_nodes = TypeAdapter(list[Node]).validate_python( await docker_client.nodes.list( filters={ "role": ["worker"], @@ -104,6 +111,8 @@ async def get_worker_nodes(docker_client: AutoscalingDocker) -> list[Node]: } ) ) + list_of_nodes.sort(key=_get_node_creation_date) + return list_of_nodes async def remove_nodes( From e65be936f6b9561d5a1d0a7d2283fef68c2b5785 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:16:49 +0100 Subject: [PATCH 08/43] check sorting --- .../tests/unit/test_utils_docker.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/services/autoscaling/tests/unit/test_utils_docker.py b/services/autoscaling/tests/unit/test_utils_docker.py index 3f9677112bb..90f214ee530 100644 --- a/services/autoscaling/tests/unit/test_utils_docker.py +++ b/services/autoscaling/tests/unit/test_utils_docker.py @@ -169,6 +169,25 @@ async def test_get_monitored_nodes_with_valid_label( ) +async def test_get_monitored_nodes_are_sorted_according_to_creation_date( + mocker: MockerFixture, + autoscaling_docker: AutoscalingDocker, + create_fake_node: Callable[..., Node], + faker: Faker, +): + fake_nodes = [ + create_fake_node(CreatedAt=faker.date_time(tzinfo=datetime.UTC).isoformat()) + for _ in range(10) + ] + mocked_aiodocker = mocker.patch.object(autoscaling_docker, "nodes", autospec=True) + mocked_aiodocker.list.return_value = fake_nodes + monitored_nodes = await get_monitored_nodes(autoscaling_docker, node_labels=[]) + assert len(monitored_nodes) == len(fake_nodes) + sorted_fake_nodes = sorted(fake_nodes, key=lambda node: arrow.get(node.created_at)) + assert monitored_nodes == sorted_fake_nodes + assert monitored_nodes[0].created_at < monitored_nodes[1].created_at + + async def test_worker_nodes( autoscaling_docker: AutoscalingDocker, host_node: Node, From 509a9b19a180893dcc88ab5de989e8943105d0c8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:17:09 +0100 Subject: [PATCH 09/43] ruff --- services/autoscaling/tests/unit/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index eccee9967b5..363e059ef2a 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -492,8 +492,8 @@ def _creator(**node_overrides) -> DockerNode: default_config = { "ID": faker.uuid4(), "Version": ObjectVersion(Index=faker.pyint()), - "CreatedAt": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), - "UpdatedAt": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), + "CreatedAt": datetime.datetime.now(tz=datetime.UTC).isoformat(), + "UpdatedAt": datetime.datetime.now(tz=datetime.UTC).isoformat(), "Description": NodeDescription( Hostname=faker.pystr(), Resources=ResourceObject( From abfd35c9a4b7820564cddfc3e757c63684aa620d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:56:04 +0100 Subject: [PATCH 10/43] refactor --- services/autoscaling/tests/unit/conftest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 363e059ef2a..282a321ea6c 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -11,7 +11,7 @@ from collections.abc import AsyncIterator, Awaitable, Callable, Iterator from copy import deepcopy from pathlib import Path -from typing import Any, Final, cast, get_args +from typing import Any, Final, TypeAlias, cast, get_args from unittest import mock import aiodocker @@ -529,7 +529,7 @@ def task_template() -> dict[str, Any]: _GIGA_NANO_CPU = 10**9 -NUM_CPUS = PositiveInt +NUM_CPUS: TypeAlias = PositiveInt @pytest.fixture @@ -761,7 +761,9 @@ def aws_allowed_ec2_instance_type_names_env( @pytest.fixture def host_cpu_count() -> int: - return psutil.cpu_count() + cpus = psutil.cpu_count() + assert cpus is not None + return cpus @pytest.fixture @@ -853,9 +855,7 @@ async def _fake_set_node_availability( returned_node.spec.availability = ( Availability.active if available else Availability.drain ) - returned_node.updated_at = datetime.datetime.now( - tz=datetime.timezone.utc - ).isoformat() + returned_node.updated_at = datetime.datetime.now(tz=datetime.UTC).isoformat() return returned_node return mocker.patch( @@ -954,7 +954,7 @@ def _creator( return AssociatedInstance( node=node, ec2_instance=fake_ec2_instance_data( - launch_time=datetime.datetime.now(datetime.timezone.utc) + launch_time=datetime.datetime.now(datetime.UTC) - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION - datetime.timedelta( days=faker.pyint(min_value=0, max_value=100), From 077d9a7d50db37f854d2fa4dc6e38a97cb772e5e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:57:08 +0100 Subject: [PATCH 11/43] fixed terminating and terminated --- .../modules/auto_scaling_core.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 909a30c9de5..38a90a7de49 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -265,9 +265,7 @@ async def _make_pending_buffer_ec2s_join_cluster( await ec2_client.set_instances_tags( buffer_ec2_ready_for_command, tags={ - DOCKER_JOIN_COMMAND_EC2_TAG_KEY: AWSTagValue( - ssm_command.command_id - ), + DOCKER_JOIN_COMMAND_EC2_TAG_KEY: ssm_command.command_id, }, ) return cluster @@ -341,10 +339,10 @@ async def _sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]: allowed_instance_type_names ), "EC2_INSTANCES_ALLOWED_TYPES cannot be empty!" - allowed_instance_types: list[ - EC2InstanceType - ] = await ec2_client.get_ec2_instance_capabilities( - cast(set[InstanceTypeType], set(allowed_instance_type_names)) + allowed_instance_types: list[EC2InstanceType] = ( + await ec2_client.get_ec2_instance_capabilities( + cast(set[InstanceTypeType], set(allowed_instance_type_names)) + ) ) def _as_selection(instance_type: EC2InstanceType) -> int: @@ -945,7 +943,7 @@ async def _find_terminateable_instances( for instance in cluster.drained_nodes: node_last_updated = utils_docker.get_node_last_readyness_update(instance.node) elapsed_time_since_drained = ( - datetime.datetime.now(datetime.timezone.utc) - node_last_updated + datetime.datetime.now(datetime.UTC) - node_last_updated ) _logger.debug("%s", f"{node_last_updated=}, {elapsed_time_since_drained=}") if ( @@ -985,6 +983,9 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: get_docker_client(app), instance.node ) new_terminating_instances.append(instance) + new_terminating_instance_ids = [ + i.ec2_instance.id for i in new_terminating_instances + ] # instances that are in the termination process and already waited long enough are terminated. now = arrow.utcnow().datetime @@ -1016,7 +1017,8 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: still_drained_nodes = [ i for i in cluster.drained_nodes - if i.ec2_instance.id not in terminated_instance_ids + if i.ec2_instance.id + not in (new_terminating_instance_ids + terminated_instance_ids) ] return dataclasses.replace( cluster, @@ -1043,7 +1045,7 @@ async def _notify_based_on_machine_type( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) launch_time_to_tasks: dict[datetime.datetime, list] = collections.defaultdict(list) - now = datetime.datetime.now(datetime.timezone.utc) + now = datetime.datetime.now(datetime.UTC) for instance in instances: launch_time_to_tasks[ instance.ec2_instance.launch_time From 608e536572e8cdf82553fc1ff02ceb1fc12ac35a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:57:25 +0100 Subject: [PATCH 12/43] refactor --- .../unit/test_modules_auto_scaling_dynamic.py | 249 +++++++++++------- 1 file changed, 156 insertions(+), 93 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 7aedbf19fd3..1aa85575161 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -15,6 +15,7 @@ from dataclasses import dataclass from typing import Any, cast from unittest import mock +import unittest import aiodocker import arrow @@ -193,8 +194,7 @@ def minimal_configuration( disable_dynamic_service_background_task: None, disable_buffers_pool_background_task: None, mocked_redis_server: None, -) -> None: - ... +) -> None: ... def _assert_rabbit_autoscaling_message_sent( @@ -415,6 +415,7 @@ def _assert_cluster_state( spied_cluster_analysis.spy_return.total_number_of_machines() == expected_num_machines ) + print("current cluster state:", spied_cluster_analysis.spy_return) async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 @@ -573,11 +574,11 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: available=with_drain_nodes_labelled, ) # update our fake node - fake_attached_node.spec.labels[ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] = mock_docker_tag_node.call_args_list[0][1]["tags"][ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] + fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( + mock_docker_tag_node.call_args_list[0][1]["tags"][ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] + ) # check the activate time is later than attach time assert arrow.get( mock_docker_tag_node.call_args_list[1][1]["tags"][ @@ -606,11 +607,11 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: available=True, ) # update our fake node - fake_attached_node.spec.labels[ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] = mock_docker_tag_node.call_args_list[1][1]["tags"][ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] + fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( + mock_docker_tag_node.call_args_list[1][1]["tags"][ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] + ) mock_docker_tag_node.reset_mock() mock_docker_set_node_availability.assert_not_called() @@ -796,9 +797,9 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: if not with_drain_nodes_labelled: fake_attached_node.spec.availability = Availability.drain fake_attached_node.spec.labels[_OSPARC_SERVICE_READY_LABEL_KEY] = "false" - fake_attached_node.spec.labels[ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] = datetime.datetime.now(tz=datetime.UTC).isoformat() + fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( + datetime.datetime.now(tz=datetime.UTC).isoformat() + ) # the node will not be terminated before the timeout triggers assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -1202,6 +1203,83 @@ async def _( ) +@pytest.fixture +async def create_services_batch( + create_service: Callable[ + [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + ], + task_template: dict[str, Any], + create_task_reservations: Callable[[int, int], dict[str, Any]], + service_monitored_labels: dict[DockerLabelKey, str], + osparc_docker_label_keys: StandardSimcoreDockerLabels, +) -> Callable[[Resources, str | None, int], Awaitable[list[Service]]]: + async def _( + service_resources: Resources, + imposed_instance_type: str | None, + num_services: int, + ) -> list[Service]: + return await asyncio.gather( + *( + create_service( + task_template + | create_task_reservations( + int(service_resources.cpus), + service_resources.ram, + ), + service_monitored_labels + | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), + "pending", + ( + [ + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={imposed_instance_type}" + ] + if imposed_instance_type + else [] + ), + ) + for _ in range(num_services) + ) + ) + + return _ + + +def _create_fake_association( + create_fake_node: Callable, drained_machine_id: str | None +): + fake_node_to_instance_map = {} + + async def _fake_node_creator( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: + if instance not in fake_node_to_instance_map: + fake_node = create_fake_node() + assert fake_node.spec + fake_node.spec.availability = Availability.active + assert fake_node.status + fake_node.status.state = NodeState.ready + assert fake_node.spec.labels + fake_node.spec.labels |= { + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), + _OSPARC_SERVICE_READY_LABEL_KEY: ( + "true" if instance.id != drained_machine_id else "false" + ), + } + fake_node_to_instance_map[instance] = fake_node + return fake_node_to_instance_map[instance] + + associated_instances = [ + AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) + for i in ec2_instances + ] + + return associated_instances, [] + + return _fake_node_creator + + +@pytest.mark.testit @pytest.mark.parametrize( "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True ) @@ -1236,19 +1314,15 @@ async def _( ), ], ) -async def test_cluster_adapts_machines_on_the_fly( +async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, ec2_client: EC2Client, initialized_app: FastAPI, app_settings: ApplicationSettings, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + create_services_batch: Callable[ + [Resources, str | None, int], Awaitable[list[Service]] ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, ec2_instance_custom_tags: dict[str, str], instance_type_filters: Sequence[FilterTypeDef], async_docker_client: aiodocker.Docker, @@ -1273,27 +1347,10 @@ async def test_cluster_adapts_machines_on_the_fly( # # 1. create the first batch of services requiring the initial machines - first_batch_services = await asyncio.gather( - *( - create_service( - task_template - | create_task_reservations( - int(scale_up_params1.service_resources.cpus), - scale_up_params1.service_resources.ram, - ), - service_monitored_labels - | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params1.imposed_instance_type}" - ] - if scale_up_params1.imposed_instance_type - else [] - ), - ) - for _ in range(scale_up_params1.num_services) - ) + first_batch_services = await create_services_batch( + scale_up_params1.service_resources, + scale_up_params1.imposed_instance_type, + scale_up_params1.num_services, ) # it will only scale once and do nothing else @@ -1316,35 +1373,9 @@ async def test_cluster_adapts_machines_on_the_fly( ) mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) mocked_associate_ec2_instances_with_nodes.reset_mock() - - fake_node_to_instance_map = {} - - async def _fake_node_creator( - nodes: list[Node], ec2_instances: list[EC2InstanceData] - ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: - def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: - if instance not in fake_node_to_instance_map: - fake_node = create_fake_node() - assert fake_node.spec - fake_node.spec.availability = Availability.active - assert fake_node.status - fake_node.status.state = NodeState.ready - assert fake_node.spec.labels - fake_node.spec.labels |= { - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), - _OSPARC_SERVICE_READY_LABEL_KEY: "true", - } - fake_node_to_instance_map[instance] = fake_node - return fake_node_to_instance_map[instance] - - associated_instances = [ - AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) - for i in ec2_instances - ] - - return associated_instances, [] - - mocked_associate_ec2_instances_with_nodes.side_effect = _fake_node_creator + mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + create_fake_node, None + ) # # 2. now the machines are associated @@ -1365,28 +1396,12 @@ def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: # # 2. now we start the second batch of services requiring a different type of machines - await asyncio.gather( - *( - create_service( - task_template - | create_task_reservations( - int(scale_up_params2.service_resources.cpus), - scale_up_params2.service_resources.ram, - ), - service_monitored_labels - | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params2.imposed_instance_type}" - ] - if scale_up_params2.imposed_instance_type - else [] - ), - ) - for _ in range(scale_up_params2.num_services) - ) + await create_services_batch( + scale_up_params2.service_resources, + scale_up_params2.imposed_instance_type, + scale_up_params2.num_services, ) + # scaling will do nothing since we have hit the maximum number of machines for _ in range(3): await auto_scale_cluster( @@ -1422,6 +1437,54 @@ def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: ) ) + # first call to auto_scale_cluster will mark 1 node as empty + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.set_node_found_empty", + autospec=True, + ) as mock_docker_set_node_found_empty: + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert spied_cluster_analysis.spy_return.active_nodes + assert not spied_cluster_analysis.spy_return.drained_nodes + + # the last machine is found empty + mock_docker_set_node_found_empty.assert_called_with( + mock.ANY, + spied_cluster_analysis.spy_return.active_nodes[-1].node, + empty=True, + ) + + # now we mock the get_node_found_empty so the next call will actually drain the machine + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_empty_since", + autospec=True, + return_value=arrow.utcnow().datetime + - 2 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING, + ) as mocked_get_node_empty_since: + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + mocked_get_node_empty_since.assert_called_once() + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert spied_cluster_analysis.spy_return.active_nodes + assert not spied_cluster_analysis.spy_return.drained_nodes + # now the drained machine + drained_machine_instance_id = spied_cluster_analysis.spy_return.active_nodes[ + -1 + ].ec2_instance.id + mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + create_fake_node, drained_machine_instance_id + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + assert spied_cluster_analysis.spy_return.active_nodes + assert spied_cluster_analysis.spy_return.drained_nodes + # mock get_node_empty_since > app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING + # auto_scale, this will drain the node + for _ in range(3): await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() From 2572c9fbcb9bdb0db804aab07e7a6e35dc2fa658 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:54:28 +0100 Subject: [PATCH 13/43] test now finally works --- .../unit/test_modules_auto_scaling_dynamic.py | 140 ++++++++++++------ 1 file changed, 95 insertions(+), 45 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 1aa85575161..d6c59165fc4 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -15,7 +15,6 @@ from dataclasses import dataclass from typing import Any, cast from unittest import mock -import unittest import aiodocker import arrow @@ -194,7 +193,8 @@ def minimal_configuration( disable_dynamic_service_background_task: None, disable_buffers_pool_background_task: None, mocked_redis_server: None, -) -> None: ... +) -> None: + ... def _assert_rabbit_autoscaling_message_sent( @@ -574,11 +574,11 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: available=with_drain_nodes_labelled, ) # update our fake node - fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( - mock_docker_tag_node.call_args_list[0][1]["tags"][ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] - ) + fake_attached_node.spec.labels[ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] = mock_docker_tag_node.call_args_list[0][1]["tags"][ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] # check the activate time is later than attach time assert arrow.get( mock_docker_tag_node.call_args_list[1][1]["tags"][ @@ -607,11 +607,11 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: available=True, ) # update our fake node - fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( - mock_docker_tag_node.call_args_list[1][1]["tags"][ - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] - ) + fake_attached_node.spec.labels[ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] = mock_docker_tag_node.call_args_list[1][1]["tags"][ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] mock_docker_tag_node.reset_mock() mock_docker_set_node_availability.assert_not_called() @@ -797,9 +797,9 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: if not with_drain_nodes_labelled: fake_attached_node.spec.availability = Availability.drain fake_attached_node.spec.labels[_OSPARC_SERVICE_READY_LABEL_KEY] = "false" - fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( - datetime.datetime.now(tz=datetime.UTC).isoformat() - ) + fake_attached_node.spec.labels[ + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY + ] = datetime.datetime.now(tz=datetime.UTC).isoformat() # the node will not be terminated before the timeout triggers assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -901,7 +901,7 @@ async def _assert_wait_for_ec2_instances_terminated() -> None: await _assert_wait_for_ec2_instances_terminated() -@pytest.mark.acceptance_test() +@pytest.mark.acceptance_test @pytest.mark.parametrize( "scale_up_params", [ @@ -1245,7 +1245,9 @@ async def _( def _create_fake_association( - create_fake_node: Callable, drained_machine_id: str | None + create_fake_node: Callable, + drained_machine_id: str | None, + terminating_machine_id: str | None, ): fake_node_to_instance_map = {} @@ -1266,6 +1268,10 @@ def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: "true" if instance.id != drained_machine_id else "false" ), } + if instance.id == terminating_machine_id: + fake_node.spec.labels |= { + _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY: arrow.utcnow().isoformat() + } fake_node_to_instance_map[instance] = fake_node return fake_node_to_instance_map[instance] @@ -1279,7 +1285,6 @@ def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: return _fake_node_creator -@pytest.mark.testit @pytest.mark.parametrize( "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True ) @@ -1333,6 +1338,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 mock_docker_tag_node: mock.Mock, mock_compute_node_used_resources: mock.Mock, spied_cluster_analysis: MockType, + mocker: MockerFixture, ): # pre-requisites assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -1374,7 +1380,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) mocked_associate_ec2_instances_with_nodes.reset_mock() mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( - create_fake_node, None + create_fake_node, None, None ) # @@ -1461,7 +1467,8 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_empty_since", autospec=True, return_value=arrow.utcnow().datetime - - 2 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING, + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING, ) as mocked_get_node_empty_since: await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() @@ -1475,43 +1482,86 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 -1 ].ec2_instance.id mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( - create_fake_node, drained_machine_instance_id + create_fake_node, drained_machine_instance_id, None ) await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) assert spied_cluster_analysis.spy_return.active_nodes assert spied_cluster_analysis.spy_return.drained_nodes - # mock get_node_empty_since > app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING - # auto_scale, this will drain the node - for _ in range(3): + # this will initiate termination now + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_last_readyness_update", + autospec=True, + return_value=arrow.utcnow().datetime + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION, + ): + mock_docker_tag_node.reset_mock() await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + mock_docker_tag_node.assert_called_with( + mock.ANY, + spied_cluster_analysis.spy_return.drained_nodes[-1].node, + tags=mock.ANY, + available=False, + ) - assert spied_cluster_analysis.spy_return.active_nodes - assert not spied_cluster_analysis.spy_return.drained_nodes + # and this should now recognize the node as terminating + mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + create_fake_node, drained_machine_instance_id, drained_machine_instance_id + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + assert spied_cluster_analysis.spy_return.active_nodes + assert not spied_cluster_analysis.spy_return.drained_nodes + assert spied_cluster_analysis.spy_return.terminating_nodes - all_instances = await ec2_client.describe_instances() - assert len(all_instances["Reservations"]) == 2, "there should be 2 Reservations" - reservation1 = all_instances["Reservations"][0] - assert "Instances" in reservation1 - assert len(reservation1["Instances"]) == ( - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - 1 - ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES-1} EC2 instances, found {len(reservation1['Instances'])}" - for instance in reservation1["Instances"]: - assert "InstanceType" in instance - assert instance["InstanceType"] == scale_up_params1.expected_instance_type - - reservation2 = all_instances["Reservations"][0] - assert "Instances" in reservation2 - assert len(reservation2["Instances"]) == ( - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - 1 - ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES-1} EC2 instances, found {len(reservation2['Instances'])}" - for instance in reservation2["Instances"]: - assert "InstanceType" in instance - assert instance["InstanceType"] == scale_up_params2.expected_instance_type + # now this will terminate it and straight away start a new machine type + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_termination_started_since", + autospec=True, + return_value=arrow.utcnow().datetime + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION, + ): + mocked_docker_remove_node = mocker.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.remove_nodes", + return_value=None, + autospec=True, + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + mocked_docker_remove_node.assert_called_once() + + # now let's check what we have + all_instances = await ec2_client.describe_instances() + assert len(all_instances["Reservations"]) == 2, "there should be 2 Reservations" + reservation1 = all_instances["Reservations"][0] + assert "Instances" in reservation1 + assert len(reservation1["Instances"]) == ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}" + for instance in reservation1["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params1.expected_instance_type + if instance["InstanceId"] == drained_machine_instance_id: + assert instance["State"]["Name"] == "terminated" + else: + assert instance["State"]["Name"] == "running" + + reservation2 = all_instances["Reservations"][1] + assert "Instances" in reservation2 + assert ( + len(reservation2["Instances"]) == 1 + ), f"expected 1 EC2 instances, found {len(reservation2['Instances'])}" + for instance in reservation2["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params2.expected_instance_type @pytest.mark.parametrize( From e3cf731b9165910be0ef424bb3fd162ff3551514 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:10:27 +0100 Subject: [PATCH 14/43] fixed test that started failing, but is correct --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index d6c59165fc4..43b1666aa3b 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -558,7 +558,7 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: mock_find_node_with_name_returns_fake_node.assert_called_once() mock_find_node_with_name_returns_fake_node.reset_mock() - assert mock_docker_tag_node.call_count == 2 + assert mock_docker_tag_node.call_count == 3 assert fake_node.spec assert fake_node.spec.labels # check attach call @@ -678,7 +678,8 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: assert mock_compute_node_used_resources.call_count == num_useless_calls * 2 mock_compute_node_used_resources.reset_mock() mock_find_node_with_name_returns_fake_node.assert_not_called() - mock_docker_tag_node.assert_not_called() + assert mock_docker_tag_node.call_count == num_useless_calls + mock_docker_tag_node.reset_mock() mock_docker_set_node_availability.assert_not_called() # check the number of instances did not change and is still running instances = await assert_autoscaled_dynamic_ec2_instances( From a9fea5c03c8a5a841d3eb34c183c3ca2ea96962c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:15:09 +0100 Subject: [PATCH 15/43] fixed test that started failing, but is correct in computational mode --- .../tests/unit/test_modules_auto_scaling_computational.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index f9e0e4c416d..35cb04d1992 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -458,7 +458,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 expected_docker_node_tags = { DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY: expected_ec2_type } - assert mock_docker_tag_node.call_count == 2 + assert mock_docker_tag_node.call_count == 3 assert fake_node.spec assert fake_node.spec.labels fake_attached_node = deepcopy(fake_node) @@ -557,7 +557,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 assert mock_dask_get_worker_used_resources.call_count == 2 * num_useless_calls mock_dask_get_worker_used_resources.reset_mock() mock_docker_find_node_with_name_returns_fake_node.assert_not_called() - mock_docker_tag_node.assert_not_called() + assert mock_docker_tag_node.call_count == num_useless_calls + mock_docker_tag_node.reset_mock() mock_docker_set_node_availability.assert_not_called() # check the number of instances did not change and is still running await assert_autoscaled_computational_ec2_instances( From a1c65beaee984060bf0b2f0483f54163c82ecd57 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:40:38 +0100 Subject: [PATCH 16/43] refactoring --- .../unit/test_modules_auto_scaling_dynamic.py | 94 ++++--------------- 1 file changed, 18 insertions(+), 76 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 43b1666aa3b..ca934d450f1 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -420,15 +420,11 @@ def _assert_cluster_state( async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 *, - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + create_services_batch: Callable[ + [Resources, str | None, int], Awaitable[list[Service]] ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -454,27 +450,10 @@ async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 ), "This test is not made to work with more than 1 expected instance. so please adapt if needed" # create the service(s) - created_docker_services = await asyncio.gather( - *( - create_service( - task_template - | create_task_reservations( - int(scale_up_params.service_resources.cpus), - scale_up_params.service_resources.ram, - ), - service_monitored_labels - | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params.imposed_instance_type}" - ] - if scale_up_params.imposed_instance_type - else [] - ), - ) - for _ in range(scale_up_params.num_services) - ) + created_docker_services = await create_services_batch( + scale_up_params.service_resources, + scale_up_params.imposed_instance_type, + scale_up_params.num_services, ) # this should trigger a scaling up as we have no nodes @@ -946,15 +925,11 @@ async def _assert_wait_for_ec2_instances_terminated() -> None: ) async def test_cluster_scaling_up_and_down( minimal_configuration: None, - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + create_services_batch: Callable[ + [Resources, str | None, int], Awaitable[list[Service]] ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -971,13 +946,9 @@ async def test_cluster_scaling_up_and_down( spied_cluster_analysis: MockType, ): await _test_cluster_scaling_up_and_down( - service_monitored_labels=service_monitored_labels, - osparc_docker_label_keys=osparc_docker_label_keys, app_settings=app_settings, initialized_app=initialized_app, - create_service=create_service, - task_template=task_template, - create_task_reservations=create_task_reservations, + create_services_batch=create_services_batch, ec2_client=ec2_client, mock_docker_tag_node=mock_docker_tag_node, fake_node=fake_node, @@ -1024,15 +995,11 @@ async def test_cluster_scaling_up_and_down_against_aws( disable_buffers_pool_background_task: None, mocked_redis_server: None, external_envfile_dict: EnvVarsDict, - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + create_services_batch: Callable[ + [Resources, str | None, int], Awaitable[list[Service]] ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -1057,13 +1024,9 @@ async def test_cluster_scaling_up_and_down_against_aws( f" The passed external ENV allows for {list(external_ec2_instances_allowed_types)}" ) await _test_cluster_scaling_up_and_down( - service_monitored_labels=service_monitored_labels, - osparc_docker_label_keys=osparc_docker_label_keys, app_settings=app_settings, initialized_app=initialized_app, - create_service=create_service, - task_template=task_template, - create_task_reservations=create_task_reservations, + create_services_batch=create_services_batch, ec2_client=ec2_client, mock_docker_tag_node=mock_docker_tag_node, fake_node=fake_node, @@ -1114,15 +1077,11 @@ async def test_cluster_scaling_up_and_down_against_aws( async def test_cluster_scaling_up_starts_multiple_instances( patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + create_services_batch: Callable[ + [Resources, str | None, int], Awaitable[list[Service]] ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, scale_up_params: _ScaleUpParams, @@ -1137,27 +1096,10 @@ async def test_cluster_scaling_up_starts_multiple_instances( assert not all_instances["Reservations"] # create several tasks that needs more power - await asyncio.gather( - *( - create_service( - task_template - | create_task_reservations( - int(scale_up_params.service_resources.cpus), - scale_up_params.service_resources.ram, - ), - service_monitored_labels - | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params.imposed_instance_type}" - ] - if scale_up_params.imposed_instance_type - else [] - ), - ) - for _ in range(scale_up_params.num_services) - ) + await create_services_batch( + scale_up_params.service_resources, + scale_up_params.imposed_instance_type, + scale_up_params.num_services, ) # run the code From 9ce628c148c88a0d082e680a1ab6cbec354fba4c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:24:16 +0100 Subject: [PATCH 17/43] re-organize --- .../unit/test_modules_auto_scaling_dynamic.py | 272 +++++++++--------- 1 file changed, 136 insertions(+), 136 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index ca934d450f1..5dfdb0045e5 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -218,24 +218,6 @@ def _assert_rabbit_autoscaling_message_sent( assert mock_rabbitmq_post_message.call_args == mock.call(app, expected_message) -async def test_cluster_scaling_with_no_services_does_nothing( - minimal_configuration: None, - app_settings: ApplicationSettings, - initialized_app: FastAPI, - mock_launch_instances: mock.Mock, - mock_terminate_instances: mock.Mock, - mock_rabbitmq_post_message: mock.Mock, -): - await auto_scale_cluster( - app=initialized_app, auto_scaling_mode=DynamicAutoscaling() - ) - mock_launch_instances.assert_not_called() - mock_terminate_instances.assert_not_called() - _assert_rabbit_autoscaling_message_sent( - mock_rabbitmq_post_message, app_settings, initialized_app - ) - - @pytest.fixture def instance_type_filters( ec2_instance_custom_tags: dict[str, str], @@ -260,6 +242,142 @@ async def spied_cluster_analysis(mocker: MockerFixture) -> MockType: return mocker.spy(auto_scaling_core, "_analyze_current_cluster") +@dataclass(frozen=True) +class _ScaleUpParams: + imposed_instance_type: str | None + service_resources: Resources + num_services: int + expected_instance_type: InstanceTypeType + expected_num_instances: int + + +def _assert_cluster_state( + spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int +) -> None: + assert spied_cluster_analysis.call_count > 0 + + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert ( + spied_cluster_analysis.spy_return.total_number_of_machines() + == expected_num_machines + ) + print("current cluster state:", spied_cluster_analysis.spy_return) + + +@pytest.fixture +async def mocked_associate_ec2_instances_with_nodes(mocker: MockerFixture) -> mock.Mock: + async def _( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + return [], ec2_instances + + return mocker.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.associate_ec2_instances_with_nodes", + autospec=True, + side_effect=_, + ) + + +@pytest.fixture +async def create_services_batch( + create_service: Callable[ + [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] + ], + task_template: dict[str, Any], + create_task_reservations: Callable[[int, int], dict[str, Any]], + service_monitored_labels: dict[DockerLabelKey, str], + osparc_docker_label_keys: StandardSimcoreDockerLabels, +) -> Callable[[Resources, str | None, int], Awaitable[list[Service]]]: + async def _( + service_resources: Resources, + imposed_instance_type: str | None, + num_services: int, + ) -> list[Service]: + return await asyncio.gather( + *( + create_service( + task_template + | create_task_reservations( + int(service_resources.cpus), + service_resources.ram, + ), + service_monitored_labels + | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), + "pending", + ( + [ + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={imposed_instance_type}" + ] + if imposed_instance_type + else [] + ), + ) + for _ in range(num_services) + ) + ) + + return _ + + +def _create_fake_association( + create_fake_node: Callable, + drained_machine_id: str | None, + terminating_machine_id: str | None, +): + fake_node_to_instance_map = {} + + async def _fake_node_creator( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: + if instance not in fake_node_to_instance_map: + fake_node = create_fake_node() + assert fake_node.spec + fake_node.spec.availability = Availability.active + assert fake_node.status + fake_node.status.state = NodeState.ready + assert fake_node.spec.labels + fake_node.spec.labels |= { + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), + _OSPARC_SERVICE_READY_LABEL_KEY: ( + "true" if instance.id != drained_machine_id else "false" + ), + } + if instance.id == terminating_machine_id: + fake_node.spec.labels |= { + _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY: arrow.utcnow().isoformat() + } + fake_node_to_instance_map[instance] = fake_node + return fake_node_to_instance_map[instance] + + associated_instances = [ + AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) + for i in ec2_instances + ] + + return associated_instances, [] + + return _fake_node_creator + + +async def test_cluster_scaling_with_no_services_does_nothing( + minimal_configuration: None, + app_settings: ApplicationSettings, + initialized_app: FastAPI, + mock_launch_instances: mock.Mock, + mock_terminate_instances: mock.Mock, + mock_rabbitmq_post_message: mock.Mock, +): + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + mock_launch_instances.assert_not_called() + mock_terminate_instances.assert_not_called() + _assert_rabbit_autoscaling_message_sent( + mock_rabbitmq_post_message, app_settings, initialized_app + ) + + async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expected_machines( patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, @@ -396,28 +514,6 @@ async def test_cluster_scaling_with_service_asking_for_too_much_resources_starts ) -@dataclass(frozen=True) -class _ScaleUpParams: - imposed_instance_type: str | None - service_resources: Resources - num_services: int - expected_instance_type: InstanceTypeType - expected_num_instances: int - - -def _assert_cluster_state( - spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int -) -> None: - assert spied_cluster_analysis.call_count > 0 - - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert ( - spied_cluster_analysis.spy_return.total_number_of_machines() - == expected_num_machines - ) - print("current cluster state:", spied_cluster_analysis.spy_return) - - async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 *, app_settings: ApplicationSettings, @@ -1132,102 +1228,6 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() -@pytest.fixture -async def mocked_associate_ec2_instances_with_nodes(mocker: MockerFixture) -> mock.Mock: - async def _( - nodes: list[Node], ec2_instances: list[EC2InstanceData] - ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: - return [], ec2_instances - - return mocker.patch( - "simcore_service_autoscaling.modules.auto_scaling_core.associate_ec2_instances_with_nodes", - autospec=True, - side_effect=_, - ) - - -@pytest.fixture -async def create_services_batch( - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] - ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], - service_monitored_labels: dict[DockerLabelKey, str], - osparc_docker_label_keys: StandardSimcoreDockerLabels, -) -> Callable[[Resources, str | None, int], Awaitable[list[Service]]]: - async def _( - service_resources: Resources, - imposed_instance_type: str | None, - num_services: int, - ) -> list[Service]: - return await asyncio.gather( - *( - create_service( - task_template - | create_task_reservations( - int(service_resources.cpus), - service_resources.ram, - ), - service_monitored_labels - | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={imposed_instance_type}" - ] - if imposed_instance_type - else [] - ), - ) - for _ in range(num_services) - ) - ) - - return _ - - -def _create_fake_association( - create_fake_node: Callable, - drained_machine_id: str | None, - terminating_machine_id: str | None, -): - fake_node_to_instance_map = {} - - async def _fake_node_creator( - nodes: list[Node], ec2_instances: list[EC2InstanceData] - ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: - def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: - if instance not in fake_node_to_instance_map: - fake_node = create_fake_node() - assert fake_node.spec - fake_node.spec.availability = Availability.active - assert fake_node.status - fake_node.status.state = NodeState.ready - assert fake_node.spec.labels - fake_node.spec.labels |= { - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), - _OSPARC_SERVICE_READY_LABEL_KEY: ( - "true" if instance.id != drained_machine_id else "false" - ), - } - if instance.id == terminating_machine_id: - fake_node.spec.labels |= { - _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY: arrow.utcnow().isoformat() - } - fake_node_to_instance_map[instance] = fake_node - return fake_node_to_instance_map[instance] - - associated_instances = [ - AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) - for i in ec2_instances - ] - - return associated_instances, [] - - return _fake_node_creator - - @pytest.mark.parametrize( "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True ) From c3855d4b8937ca8bced6bba0eb086c68e230fd17 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:28:09 +0100 Subject: [PATCH 18/43] re-organize more --- .../modules/auto_scaling_core.py | 9 +- ...test_modules_auto_scaling_computational.py | 194 ++++++++---------- 2 files changed, 90 insertions(+), 113 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 38a90a7de49..472b15bafd2 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -16,7 +16,6 @@ Resources, ) from aws_library.ec2._errors import EC2TooManyInstancesError -from aws_library.ec2._models import AWSTagValue from fastapi import FastAPI from models_library.generated_models.docker_rest_api import Node, NodeState from servicelib.logging_utils import log_catch, log_context @@ -339,10 +338,10 @@ async def _sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]: allowed_instance_type_names ), "EC2_INSTANCES_ALLOWED_TYPES cannot be empty!" - allowed_instance_types: list[EC2InstanceType] = ( - await ec2_client.get_ec2_instance_capabilities( - cast(set[InstanceTypeType], set(allowed_instance_type_names)) - ) + allowed_instance_types: list[ + EC2InstanceType + ] = await ec2_client.get_ec2_instance_capabilities( + cast(set[InstanceTypeType], set(allowed_instance_type_names)) ) def _as_selection(instance_type: EC2InstanceType) -> int: diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 35cb04d1992..37c6c682726 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -43,7 +43,6 @@ ) from simcore_service_autoscaling.modules.dask import DaskTaskResources from simcore_service_autoscaling.modules.docker import get_docker_client -from simcore_service_autoscaling.modules.ec2 import SimcoreEC2API from simcore_service_autoscaling.utils.utils_docker import ( _OSPARC_NODE_EMPTY_DATETIME_LABEL_KEY, _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY, @@ -182,6 +181,92 @@ def ec2_instance_custom_tags( } +@pytest.fixture +def create_dask_task_resources() -> Callable[..., DaskTaskResources]: + def _do( + ec2_instance_type: InstanceTypeType | None, ram: ByteSize + ) -> DaskTaskResources: + resources = DaskTaskResources( + { + "RAM": int(ram), + } + ) + if ec2_instance_type is not None: + resources[create_ec2_resource_constraint_key(ec2_instance_type)] = 1 + return resources + + return _do + + +@pytest.fixture +def mock_dask_get_worker_has_results_in_memory(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_autoscaling.modules.dask.get_worker_still_has_results_in_memory", + return_value=0, + autospec=True, + ) + + +@pytest.fixture +def mock_dask_get_worker_used_resources(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_autoscaling.modules.dask.get_worker_used_resources", + return_value=Resources.create_as_empty(), + autospec=True, + ) + + +@pytest.fixture +def mock_dask_is_worker_connected(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_autoscaling.modules.dask.is_worker_connected", + return_value=True, + autospec=True, + ) + + +async def _create_task_with_resources( + ec2_client: EC2Client, + dask_task_imposed_ec2_type: InstanceTypeType | None, + dask_ram: ByteSize | None, + create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task: Callable[[DaskTaskResources], distributed.Future], +) -> distributed.Future: + if dask_task_imposed_ec2_type and not dask_ram: + instance_types = await ec2_client.describe_instance_types( + InstanceTypes=[dask_task_imposed_ec2_type] + ) + assert instance_types + assert "InstanceTypes" in instance_types + assert instance_types["InstanceTypes"] + assert "MemoryInfo" in instance_types["InstanceTypes"][0] + assert "SizeInMiB" in instance_types["InstanceTypes"][0]["MemoryInfo"] + dask_ram = TypeAdapter(ByteSize).validate_python( + f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", + ) + dask_task_resources = create_dask_task_resources( + dask_task_imposed_ec2_type, dask_ram + ) + dask_future = create_dask_task(dask_task_resources) + assert dask_future + return dask_future + + +@dataclass(frozen=True) +class _ScaleUpParams: + task_resources: Resources + num_tasks: int + expected_instance_type: str + expected_num_instances: int + + +def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResources: + return { + res_key.upper(): res_value + for res_key, res_value in resources.model_dump().items() + } + + async def test_cluster_scaling_with_no_tasks_does_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -259,77 +344,6 @@ async def test_cluster_scaling_with_task_with_too_much_resources_starts_nothing( ) -@pytest.fixture -def create_dask_task_resources() -> Callable[..., DaskTaskResources]: - def _do( - ec2_instance_type: InstanceTypeType | None, ram: ByteSize - ) -> DaskTaskResources: - resources = DaskTaskResources( - { - "RAM": int(ram), - } - ) - if ec2_instance_type is not None: - resources[create_ec2_resource_constraint_key(ec2_instance_type)] = 1 - return resources - - return _do - - -@pytest.fixture -def mock_dask_get_worker_has_results_in_memory(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_autoscaling.modules.dask.get_worker_still_has_results_in_memory", - return_value=0, - autospec=True, - ) - - -@pytest.fixture -def mock_dask_get_worker_used_resources(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_autoscaling.modules.dask.get_worker_used_resources", - return_value=Resources.create_as_empty(), - autospec=True, - ) - - -@pytest.fixture -def mock_dask_is_worker_connected(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_autoscaling.modules.dask.is_worker_connected", - return_value=True, - autospec=True, - ) - - -async def _create_task_with_resources( - ec2_client: EC2Client, - dask_task_imposed_ec2_type: InstanceTypeType | None, - dask_ram: ByteSize | None, - create_dask_task_resources: Callable[..., DaskTaskResources], - create_dask_task: Callable[[DaskTaskResources], distributed.Future], -) -> distributed.Future: - if dask_task_imposed_ec2_type and not dask_ram: - instance_types = await ec2_client.describe_instance_types( - InstanceTypes=[dask_task_imposed_ec2_type] - ) - assert instance_types - assert "InstanceTypes" in instance_types - assert instance_types["InstanceTypes"] - assert "MemoryInfo" in instance_types["InstanceTypes"][0] - assert "SizeInMiB" in instance_types["InstanceTypes"][0]["MemoryInfo"] - dask_ram = TypeAdapter(ByteSize).validate_python( - f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", - ) - dask_task_resources = create_dask_task_resources( - dask_task_imposed_ec2_type, dask_ram - ) - dask_future = create_dask_task(dask_task_resources) - assert dask_future - return dask_future - - @pytest.mark.acceptance_test() @pytest.mark.parametrize( "dask_task_imposed_ec2_type, dask_ram, expected_ec2_type", @@ -809,42 +823,6 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso assert "Unexpected error:" in error_messages[0] -@dataclass(frozen=True) -class _ScaleUpParams: - task_resources: Resources - num_tasks: int - expected_instance_type: str - expected_num_instances: int - - -def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResources: - return { - res_key.upper(): res_value - for res_key, res_value in resources.model_dump().items() - } - - -@pytest.fixture -def patch_ec2_client_launch_instancess_min_number_of_instances( - mocker: MockerFixture, -) -> mock.Mock: - """the moto library always returns min number of instances instead of max number of instances which makes - it difficult to test scaling to multiple of machines. this should help""" - original_fct = SimcoreEC2API.launch_instances - - async def _change_parameters(*args, **kwargs) -> list[EC2InstanceData]: - new_kwargs = kwargs | {"min_number_of_instances": kwargs["number_of_instances"]} - print(f"patching launch_instances with: {new_kwargs}") - return await original_fct(*args, **new_kwargs) - - return mocker.patch.object( - SimcoreEC2API, - "launch_instances", - autospec=True, - side_effect=_change_parameters, - ) - - @pytest.mark.parametrize( "scale_up_params", [ From 41ee6154bfbdec036b8568241e0abb7b5f656cd1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:28:29 +0100 Subject: [PATCH 19/43] ruff --- .../tests/unit/test_modules_auto_scaling_computational.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 37c6c682726..939c08ac379 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -673,7 +673,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 fake_attached_node.spec.labels[_OSPARC_SERVICE_READY_LABEL_KEY] = "false" fake_attached_node.spec.labels[ _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY - ] = datetime.datetime.now(tz=datetime.timezone.utc).isoformat() + ] = datetime.datetime.now(tz=datetime.UTC).isoformat() # the node will be not be terminated before the timeout triggers assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -699,7 +699,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 # now changing the last update timepoint will trigger the node removal and shutdown the ec2 instance fake_attached_node.spec.labels[_OSPARC_SERVICES_READY_DATETIME_LABEL_KEY] = ( - datetime.datetime.now(tz=datetime.timezone.utc) + datetime.datetime.now(tz=datetime.UTC) - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION - datetime.timedelta(seconds=1) ).isoformat() From cc1713b4d99735ea7e5864cc2dba10d0b3e70e07 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:54:58 +0100 Subject: [PATCH 20/43] prepare test for computations --- .../tests/unit/test_modules_auto_scaling_computational.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 939c08ac379..b4c29649a8e 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -901,6 +901,13 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 + patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + minimal_configuration: None, +): + ... + + async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and_not_more( patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, From 5c6df69544555370745d5392ca379c311b174935 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:54:41 +0100 Subject: [PATCH 21/43] incompatible with osparc.io --- .../computational-clusters/autoscaled_monitor/core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py b/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py index c0c4ba7bed6..540b4581ab6 100755 --- a/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py +++ b/scripts/maintenance/computational-clusters/autoscaled_monitor/core.py @@ -138,7 +138,6 @@ def _print_dynamic_instances( f"{utils.color_encode_with_state(instance.name, instance.ec2_instance)}", f"ID: {instance.ec2_instance.instance_id}", f"AMI: {instance.ec2_instance.image_id}", - f"AMI name: {instance.ec2_instance.image.name}", f"Type: {instance.ec2_instance.instance_type}", f"Up: {utils.timedelta_formatting(time_now - instance.ec2_instance.launch_time, color_code=True)}", f"ExtIP: {instance.ec2_instance.public_ip_address}", @@ -183,7 +182,6 @@ def _print_computational_clusters( f"Name: {cluster.primary.name}", f"ID: {cluster.primary.ec2_instance.id}", f"AMI: {cluster.primary.ec2_instance.image_id}", - f"AMI name: {cluster.primary.ec2_instance.image.name}", f"Type: {cluster.primary.ec2_instance.instance_type}", f"Up: {utils.timedelta_formatting(time_now - cluster.primary.ec2_instance.launch_time, color_code=True)}", f"ExtIP: {cluster.primary.ec2_instance.public_ip_address}", @@ -229,7 +227,6 @@ def _print_computational_clusters( f"Name: {worker.name}", f"ID: {worker.ec2_instance.id}", f"AMI: {worker.ec2_instance.image_id}", - f"AMI name: {worker.ec2_instance.image.name}", f"Type: {worker.ec2_instance.instance_type}", f"Up: {utils.timedelta_formatting(time_now - worker.ec2_instance.launch_time, color_code=True)}", f"ExtIP: {worker.ec2_instance.public_ip_address}", From 2007b8becc194c88a7c3662ab8b2ec46abbdd965 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:14:41 +0100 Subject: [PATCH 22/43] refactoring --- ...test_modules_auto_scaling_computational.py | 119 ++++++++++-------- .../unit/test_modules_auto_scaling_dynamic.py | 7 +- 2 files changed, 73 insertions(+), 53 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index b4c29649a8e..1616d9db58b 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -184,11 +184,11 @@ def ec2_instance_custom_tags( @pytest.fixture def create_dask_task_resources() -> Callable[..., DaskTaskResources]: def _do( - ec2_instance_type: InstanceTypeType | None, ram: ByteSize + ec2_instance_type: InstanceTypeType | None, task_resource: Resources ) -> DaskTaskResources: resources = DaskTaskResources( { - "RAM": int(ram), + "RAM": int(task_resource.ram), } ) if ec2_instance_type is not None: @@ -228,11 +228,11 @@ def mock_dask_is_worker_connected(mocker: MockerFixture) -> mock.Mock: async def _create_task_with_resources( ec2_client: EC2Client, dask_task_imposed_ec2_type: InstanceTypeType | None, - dask_ram: ByteSize | None, + task_resources: Resources | None, create_dask_task_resources: Callable[..., DaskTaskResources], create_dask_task: Callable[[DaskTaskResources], distributed.Future], ) -> distributed.Future: - if dask_task_imposed_ec2_type and not dask_ram: + if dask_task_imposed_ec2_type and not task_resources: instance_types = await ec2_client.describe_instance_types( InstanceTypes=[dask_task_imposed_ec2_type] ) @@ -241,11 +241,15 @@ async def _create_task_with_resources( assert instance_types["InstanceTypes"] assert "MemoryInfo" in instance_types["InstanceTypes"][0] assert "SizeInMiB" in instance_types["InstanceTypes"][0]["MemoryInfo"] - dask_ram = TypeAdapter(ByteSize).validate_python( - f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", + task_resources = Resources( + cpus=1, + ram=TypeAdapter(ByteSize).validate_python( + f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", + ), ) + dask_task_resources = create_dask_task_resources( - dask_task_imposed_ec2_type, dask_ram + dask_task_imposed_ec2_type, task_resources ) dask_future = create_dask_task(dask_task_resources) assert dask_future @@ -254,9 +258,10 @@ async def _create_task_with_resources( @dataclass(frozen=True) class _ScaleUpParams: - task_resources: Resources + imposed_instance_type: InstanceTypeType | None + task_resources: Resources | None num_tasks: int - expected_instance_type: str + expected_instance_type: InstanceTypeType expected_num_instances: int @@ -344,32 +349,52 @@ async def test_cluster_scaling_with_task_with_too_much_resources_starts_nothing( ) -@pytest.mark.acceptance_test() +@pytest.mark.acceptance_test @pytest.mark.parametrize( - "dask_task_imposed_ec2_type, dask_ram, expected_ec2_type", + "scale_up_params", [ pytest.param( - None, - TypeAdapter(ByteSize).validate_python("128Gib"), - "r5n.4xlarge", + _ScaleUpParams( + imposed_instance_type=None, + task_resources=Resources( + cpus=1, ram=TypeAdapter(ByteSize).validate_python("128Gib") + ), + num_tasks=1, + expected_instance_type="r5n.4xlarge", + expected_num_instances=1, + ), id="No explicit instance defined", ), pytest.param( - "g4dn.2xlarge", - None, - "g4dn.2xlarge", + _ScaleUpParams( + imposed_instance_type="g4dn.2xlarge", + task_resources=None, + num_tasks=1, + expected_instance_type="g4dn.2xlarge", + expected_num_instances=1, + ), id="Explicitely ask for g4dn.2xlarge and use all the resources", ), pytest.param( - "r5n.8xlarge", - TypeAdapter(ByteSize).validate_python("116Gib"), - "r5n.8xlarge", + _ScaleUpParams( + imposed_instance_type="r5n.8xlarge", + task_resources=Resources( + cpus=1, ram=TypeAdapter(ByteSize).validate_python("116Gib") + ), + num_tasks=1, + expected_instance_type="r5n.8xlarge", + expected_num_instances=1, + ), id="Explicitely ask for r5n.8xlarge and set the resources", ), pytest.param( - "r5n.8xlarge", - None, - "r5n.8xlarge", + _ScaleUpParams( + imposed_instance_type="r5n.8xlarge", + task_resources=None, + num_tasks=1, + expected_instance_type="r5n.8xlarge", + expected_num_instances=1, + ), id="Explicitely ask for r5n.8xlarge and use all the resources", ), ], @@ -392,11 +417,9 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mocker: MockerFixture, dask_spec_local_cluster: distributed.SpecCluster, create_dask_task_resources: Callable[..., DaskTaskResources], - dask_task_imposed_ec2_type: InstanceTypeType | None, - dask_ram: ByteSize | None, - expected_ec2_type: InstanceTypeType, with_drain_nodes_labelled: bool, ec2_instance_custom_tags: dict[str, str], + scale_up_params: _ScaleUpParams, ): # we have nothing running now all_instances = await ec2_client.describe_instances() @@ -405,8 +428,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 # create a task that needs more power dask_future = await _create_task_with_resources( ec2_client, - dask_task_imposed_ec2_type, - dask_ram, + scale_up_params.imposed_instance_type, + scale_up_params.task_resources, create_dask_task_resources, create_dask_task, ) @@ -420,8 +443,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -457,8 +480,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 instances = await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -470,7 +493,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_docker_find_node_with_name_returns_fake_node.assert_called_once() mock_docker_find_node_with_name_returns_fake_node.reset_mock() expected_docker_node_tags = { - DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY: expected_ec2_type + DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY: scale_up_params.expected_instance_type } assert mock_docker_tag_node.call_count == 3 assert fake_node.spec @@ -539,7 +562,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 # now we have 1 monitored node that needs to be mocked fake_attached_node.spec.labels[_OSPARC_SERVICE_READY_LABEL_KEY] = "true" fake_attached_node.status = NodeStatus( - State=NodeState.ready, Message=None, Addr=None + state=NodeState.ready, message=None, addr=None ) fake_attached_node.spec.availability = Availability.active assert fake_attached_node.description @@ -578,8 +601,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -662,8 +685,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -691,8 +714,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -709,8 +732,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -741,8 +764,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="terminated", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -828,6 +851,7 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso [ pytest.param( _ScaleUpParams( + imposed_instance_type=None, task_resources=Resources( cpus=5, ram=TypeAdapter(ByteSize).validate_python("36Gib") ), @@ -901,13 +925,6 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() -async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, - minimal_configuration: None, -): - ... - - async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and_not_more( patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 5dfdb0045e5..12d13412f73 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -244,7 +244,7 @@ async def spied_cluster_analysis(mocker: MockerFixture) -> MockType: @dataclass(frozen=True) class _ScaleUpParams: - imposed_instance_type: str | None + imposed_instance_type: InstanceTypeType | None service_resources: Resources num_services: int expected_instance_type: InstanceTypeType @@ -730,7 +730,7 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: # now we have 1 monitored node that needs to be mocked fake_attached_node.spec.labels[_OSPARC_SERVICE_READY_LABEL_KEY] = "true" fake_attached_node.status = NodeStatus( - State=NodeState.ready, Message=None, Addr=None + state=NodeState.ready, message=None, addr=None ) fake_attached_node.spec.availability = Availability.active fake_attached_node.description.hostname = internal_dns_name @@ -1492,6 +1492,9 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 for instance in reservation1["Instances"]: assert "InstanceType" in instance assert instance["InstanceType"] == scale_up_params1.expected_instance_type + assert "InstanceId" in instance + assert "State" in instance + assert "Name" in instance["State"] if instance["InstanceId"] == drained_machine_instance_id: assert instance["State"]["Name"] == "terminated" else: From d1e0615e7b31524704748190351ff6a330b06012 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:23:41 +0100 Subject: [PATCH 23/43] cleanup --- ...test_modules_auto_scaling_computational.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 1616d9db58b..0dc6b5624d2 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -182,7 +182,9 @@ def ec2_instance_custom_tags( @pytest.fixture -def create_dask_task_resources() -> Callable[..., DaskTaskResources]: +def create_dask_task_resources() -> ( + Callable[[InstanceTypeType | None, Resources], DaskTaskResources] +): def _do( ec2_instance_type: InstanceTypeType | None, task_resource: Resources ) -> DaskTaskResources: @@ -229,7 +231,9 @@ async def _create_task_with_resources( ec2_client: EC2Client, dask_task_imposed_ec2_type: InstanceTypeType | None, task_resources: Resources | None, - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], create_dask_task: Callable[[DaskTaskResources], distributed.Future], ) -> distributed.Future: if dask_task_imposed_ec2_type and not task_resources: @@ -416,7 +420,9 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_dask_is_worker_connected: mock.Mock, mocker: MockerFixture, dask_spec_local_cluster: distributed.SpecCluster, - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], with_drain_nodes_labelled: bool, ec2_instance_custom_tags: dict[str, str], scale_up_params: _ScaleUpParams, @@ -779,7 +785,9 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_allowed( app_settings: ApplicationSettings, initialized_app: FastAPI, create_dask_task: Callable[[DaskTaskResources], distributed.Future], - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], ec2_client: EC2Client, faker: Faker, caplog: pytest.LogCaptureFixture, @@ -790,7 +798,8 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_allowed( # create a task that needs more power dask_task_resources = create_dask_task_resources( - faker.pystr(), TypeAdapter(ByteSize).validate_python("128GiB") + faker.pystr(), + Resources(cpus=1, ram=TypeAdapter(ByteSize).validate_python("128GiB")), ) dask_future = create_dask_task(dask_task_resources) assert dask_future @@ -815,7 +824,9 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso app_settings: ApplicationSettings, initialized_app: FastAPI, create_dask_task: Callable[[DaskTaskResources], distributed.Future], - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], ec2_client: EC2Client, faker: Faker, caplog: pytest.LogCaptureFixture, @@ -826,7 +837,8 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso # create a task that needs more power dask_task_resources = create_dask_task_resources( - "t2.xlarge", TypeAdapter(ByteSize).validate_python("128GiB") + "t2.xlarge", + Resources(cpus=1, ram=TypeAdapter(ByteSize).validate_python("128GiB")), ) dask_future = create_dask_task(dask_task_resources) assert dask_future @@ -933,7 +945,9 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and create_dask_task: Callable[[DaskTaskResources], distributed.Future], ec2_client: EC2Client, dask_spec_local_cluster: distributed.SpecCluster, - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], mock_docker_tag_node: mock.Mock, mock_rabbitmq_post_message: mock.Mock, mock_docker_find_node_with_name_returns_fake_node: mock.Mock, @@ -1021,7 +1035,9 @@ async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_star create_dask_task: Callable[[DaskTaskResources], distributed.Future], ec2_client: EC2Client, dask_spec_local_cluster: distributed.SpecCluster, - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], mock_docker_tag_node: mock.Mock, mock_rabbitmq_post_message: mock.Mock, mock_docker_find_node_with_name_returns_fake_node: mock.Mock, @@ -1128,7 +1144,9 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( ec2_client: EC2Client, dask_task_imposed_ec2_type: InstanceTypeType | None, dask_ram: ByteSize | None, - create_dask_task_resources: Callable[..., DaskTaskResources], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], dask_spec_local_cluster: distributed.SpecCluster, expected_ec2_type: InstanceTypeType, mock_find_node_with_name_returns_none: mock.Mock, @@ -1149,7 +1167,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( dask_future = await _create_task_with_resources( ec2_client, dask_task_imposed_ec2_type, - dask_ram, + Resources(cpus=1, ram=dask_ram), create_dask_task_resources, create_dask_task, ) From fe12f8c869bcbcc0feca711c9d511bc16760982d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:44:57 +0100 Subject: [PATCH 24/43] refactor --- ...test_modules_auto_scaling_computational.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 0dc6b5624d2..3dca7c08794 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -1125,12 +1125,18 @@ async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_star @pytest.mark.parametrize( - "dask_task_imposed_ec2_type, dask_ram, expected_ec2_type", + "scale_up_params", [ pytest.param( - None, - TypeAdapter(ByteSize).validate_python("128Gib"), - "r5n.4xlarge", + _ScaleUpParams( + imposed_instance_type=None, + task_resources=Resources( + cpus=1, ram=TypeAdapter(ByteSize).validate_python("128Gib") + ), + num_tasks=1, + expected_instance_type="r5n.4xlarge", + expected_num_instances=1, + ), id="No explicit instance defined", ), ], @@ -1142,18 +1148,16 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( initialized_app: FastAPI, create_dask_task: Callable[[DaskTaskResources], distributed.Future], ec2_client: EC2Client, - dask_task_imposed_ec2_type: InstanceTypeType | None, - dask_ram: ByteSize | None, create_dask_task_resources: Callable[ [InstanceTypeType | None, Resources], DaskTaskResources ], dask_spec_local_cluster: distributed.SpecCluster, - expected_ec2_type: InstanceTypeType, mock_find_node_with_name_returns_none: mock.Mock, mock_docker_tag_node: mock.Mock, mock_rabbitmq_post_message: mock.Mock, short_ec2_instance_max_start_time: datetime.timedelta, ec2_instance_custom_tags: dict[str, str], + scale_up_params: _ScaleUpParams, ): assert app_settings.AUTOSCALING_EC2_INSTANCES assert ( @@ -1166,8 +1170,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( # create a task that needs more power dask_future = await _create_task_with_resources( ec2_client, - dask_task_imposed_ec2_type, - Resources(cpus=1, ram=dask_ram), + scale_up_params.imposed_instance_type, + scale_up_params.task_resources, create_dask_task_resources, create_dask_task, ) @@ -1181,8 +1185,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( instances = await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -1224,8 +1228,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( instances = await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -1237,7 +1241,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( initialized_app, dask_spec_local_cluster.scheduler_address, instances_running=0, - instances_pending=1, + instances_pending=scale_up_params.expected_num_instances, ) mock_rabbitmq_post_message.reset_mock() assert instances @@ -1268,7 +1272,10 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( all_instances = await ec2_client.describe_instances() assert len(all_instances["Reservations"]) == 2 assert "Instances" in all_instances["Reservations"][0] - assert len(all_instances["Reservations"][0]["Instances"]) == 1 + assert ( + len(all_instances["Reservations"][0]["Instances"]) + == scale_up_params.expected_num_instances + ) assert "State" in all_instances["Reservations"][0]["Instances"][0] assert "Name" in all_instances["Reservations"][0]["Instances"][0]["State"] assert ( @@ -1277,7 +1284,10 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( ) assert "Instances" in all_instances["Reservations"][1] - assert len(all_instances["Reservations"][1]["Instances"]) == 1 + assert ( + len(all_instances["Reservations"][1]["Instances"]) + == scale_up_params.expected_num_instances + ) assert "State" in all_instances["Reservations"][1]["Instances"][0] assert "Name" in all_instances["Reservations"][1]["Instances"][0]["State"] assert ( From 40bd952aebfa194808bf5860a9ebacea12c78d0d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:22:16 +0100 Subject: [PATCH 25/43] refactor --- ...test_modules_auto_scaling_computational.py | 104 ++++++++++-------- .../unit/test_modules_auto_scaling_dynamic.py | 2 +- 2 files changed, 62 insertions(+), 44 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 3dca7c08794..f3d327bbac7 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -14,7 +14,7 @@ from collections.abc import Callable, Iterator from copy import deepcopy from dataclasses import dataclass -from typing import Any +from typing import Any, Awaitable from unittest import mock import arrow @@ -188,11 +188,7 @@ def create_dask_task_resources() -> ( def _do( ec2_instance_type: InstanceTypeType | None, task_resource: Resources ) -> DaskTaskResources: - resources = DaskTaskResources( - { - "RAM": int(task_resource.ram), - } - ) + resources = _dask_task_resources_from_resources(task_resource) if ec2_instance_type is not None: resources[create_ec2_resource_constraint_key(ec2_instance_type)] = 1 return resources @@ -260,7 +256,7 @@ async def _create_task_with_resources( return dask_future -@dataclass(frozen=True) +@dataclass(kw_only=True) class _ScaleUpParams: imposed_instance_type: InstanceTypeType | None task_resources: Resources | None @@ -276,6 +272,31 @@ def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResourc } +@pytest.fixture +async def create_tasks_batch( + ec2_client: EC2Client, + create_dask_task: Callable[[DaskTaskResources], distributed.Future], + create_dask_task_resources: Callable[ + [InstanceTypeType | None, Resources], DaskTaskResources + ], +) -> Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]]: + async def _(scale_up_params: _ScaleUpParams) -> list[distributed.Future]: + return await asyncio.gather( + *( + _create_task_with_resources( + ec2_client, + scale_up_params.imposed_instance_type, + scale_up_params.task_resources, + create_dask_task_resources, + create_dask_task, + ) + for _ in range(scale_up_params.num_tasks) + ) + ) + + return _ + + async def test_cluster_scaling_with_no_tasks_does_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -880,7 +901,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_dask_task: Callable[[DaskTaskResources], distributed.Future], + create_tasks_batch: Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, scale_up_params: _ScaleUpParams, @@ -895,16 +916,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( assert not all_instances["Reservations"] # create several tasks that needs more power - dask_futures = await asyncio.gather( - *( - asyncio.get_event_loop().run_in_executor( - None, - create_dask_task, - _dask_task_resources_from_resources(scale_up_params.task_resources), - ) - for _ in range(scale_up_params.num_tasks) - ) - ) + dask_futures = await create_tasks_batch(scale_up_params) assert dask_futures # run the code @@ -917,7 +929,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( ec2_client, expected_num_reservations=1, expected_num_instances=scale_up_params.expected_num_instances, - expected_instance_type="g3.4xlarge", + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -937,17 +949,29 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +@pytest.mark.parametrize( + "scale_up_params", + [ + pytest.param( + _ScaleUpParams( + imposed_instance_type="r5n.8xlarge", + task_resources=None, + num_tasks=1, + expected_instance_type="r5n.8xlarge", + expected_num_instances=1, + ), + id="Impose r5n.8xlarge without resources", + ), + ], +) async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and_not_more( patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_dask_task: Callable[[DaskTaskResources], distributed.Future], + create_tasks_batch: Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]], ec2_client: EC2Client, dask_spec_local_cluster: distributed.SpecCluster, - create_dask_task_resources: Callable[ - [InstanceTypeType | None, Resources], DaskTaskResources - ], mock_docker_tag_node: mock.Mock, mock_rabbitmq_post_message: mock.Mock, mock_docker_find_node_with_name_returns_fake_node: mock.Mock, @@ -956,29 +980,23 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and mock_dask_get_worker_has_results_in_memory: mock.Mock, mock_dask_get_worker_used_resources: mock.Mock, ec2_instance_custom_tags: dict[str, str], + scale_up_params: _ScaleUpParams, ): - ec2_instance_type = "r5n.8xlarge" - # we have nothing running now all_instances = await ec2_client.describe_instances() assert not all_instances["Reservations"] assert app_settings.AUTOSCALING_EC2_INSTANCES assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES > 0 - num_tasks = 3 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + # override the number of tasks + scale_up_params.num_tasks = ( + 3 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ) + scale_up_params.expected_num_instances = ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ) # create the tasks - task_futures = await asyncio.gather( - *( - _create_task_with_resources( - ec2_client, - ec2_instance_type, - None, - create_dask_task_resources, - create_dask_task, - ) - for _ in range(num_tasks) - ) - ) + task_futures = await create_tasks_batch(scale_up_params) assert all(task_futures) # this should trigger a scaling up as we have no nodes @@ -988,8 +1006,8 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, - expected_instance_type=ec2_instance_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) @@ -1007,7 +1025,7 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and initialized_app, dask_spec_local_cluster.scheduler_address, instances_running=0, - instances_pending=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + instances_pending=scale_up_params.expected_num_instances, ) mock_rabbitmq_post_message.reset_mock() @@ -1020,8 +1038,8 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and await assert_autoscaled_computational_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, - expected_instance_type=ec2_instance_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), ) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 12d13412f73..753d25f9895 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -290,7 +290,7 @@ async def create_services_batch( ) -> Callable[[Resources, str | None, int], Awaitable[list[Service]]]: async def _( service_resources: Resources, - imposed_instance_type: str | None, + imposed_instance_type: InstanceTypeType | None, num_services: int, ) -> list[Service]: return await asyncio.gather( From 9f20912d25ee7f3f7c47be3214b04cd7e446b38c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:31:19 +0100 Subject: [PATCH 26/43] ready to add the new test --- ...test_modules_auto_scaling_computational.py | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index f3d327bbac7..8115a8e4363 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -14,7 +14,7 @@ from collections.abc import Callable, Iterator from copy import deepcopy from dataclasses import dataclass -from typing import Any, Awaitable +from typing import Any, Awaitable, cast from unittest import mock import arrow @@ -248,6 +248,7 @@ async def _create_task_with_resources( ), ) + assert task_resources dask_task_resources = create_dask_task_resources( dask_task_imposed_ec2_type, task_resources ) @@ -428,7 +429,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_dask_task: Callable[[DaskTaskResources], distributed.Future], + create_tasks_batch: Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: DockerNode, @@ -441,9 +442,6 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_dask_is_worker_connected: mock.Mock, mocker: MockerFixture, dask_spec_local_cluster: distributed.SpecCluster, - create_dask_task_resources: Callable[ - [InstanceTypeType | None, Resources], DaskTaskResources - ], with_drain_nodes_labelled: bool, ec2_instance_custom_tags: dict[str, str], scale_up_params: _ScaleUpParams, @@ -453,14 +451,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 assert not all_instances["Reservations"] # create a task that needs more power - dask_future = await _create_task_with_resources( - ec2_client, - scale_up_params.imposed_instance_type, - scale_up_params.task_resources, - create_dask_task_resources, - create_dask_task, - ) - + dask_futures = await create_tasks_batch(scale_up_params) + assert dask_futures # this should trigger a scaling up as we have no nodes await auto_scale_cluster( app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() @@ -642,7 +634,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 # # 4. now scaling down, as we deleted all the tasks # - del dask_future + del dask_futures await auto_scale_cluster(app=initialized_app, auto_scaling_mode=auto_scaling_mode) mock_dask_is_worker_connected.assert_called_once() mock_dask_is_worker_connected.reset_mock() @@ -819,7 +811,7 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_allowed( # create a task that needs more power dask_task_resources = create_dask_task_resources( - faker.pystr(), + cast(InstanceTypeType, faker.pystr()), Resources(cpus=1, ram=TypeAdapter(ByteSize).validate_python("128GiB")), ) dask_future = create_dask_task(dask_task_resources) @@ -1164,11 +1156,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_dask_task: Callable[[DaskTaskResources], distributed.Future], + create_tasks_batch: Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]], ec2_client: EC2Client, - create_dask_task_resources: Callable[ - [InstanceTypeType | None, Resources], DaskTaskResources - ], dask_spec_local_cluster: distributed.SpecCluster, mock_find_node_with_name_returns_none: mock.Mock, mock_docker_tag_node: mock.Mock, @@ -1186,14 +1175,9 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( all_instances = await ec2_client.describe_instances() assert not all_instances["Reservations"] # create a task that needs more power - dask_future = await _create_task_with_resources( - ec2_client, - scale_up_params.imposed_instance_type, - scale_up_params.task_resources, - create_dask_task_resources, - create_dask_task, - ) - assert dask_future + dask_futures = await create_tasks_batch(scale_up_params) + assert dask_futures + # this should trigger a scaling up as we have no nodes await auto_scale_cluster( app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() From 66a99aed662e20def5b08110bc4bb27305656d8f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:24:39 +0100 Subject: [PATCH 27/43] move fixture down --- services/autoscaling/tests/unit/conftest.py | 7 +++++++ .../tests/unit/test_modules_auto_scaling_dynamic.py | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 282a321ea6c..1d676b404f2 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -46,6 +46,7 @@ TaskSpec, ) from pydantic import ByteSize, PositiveInt, TypeAdapter +from pytest_mock import MockType from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.host import get_localhost_ip from pytest_simcore.helpers.logging_tools import log_context @@ -69,6 +70,7 @@ Cluster, DaskTaskResources, ) +from simcore_service_autoscaling.modules import auto_scaling_core from simcore_service_autoscaling.modules.docker import AutoscalingDocker from simcore_service_autoscaling.modules.ec2 import SimcoreEC2API from simcore_service_autoscaling.utils.utils_docker import ( @@ -1002,3 +1004,8 @@ def with_short_ec2_instances_max_start_time( "EC2_INSTANCES_MAX_START_TIME": f"{short_ec2_instance_max_start_time}", }, ) + + +@pytest.fixture +async def spied_cluster_analysis(mocker: MockerFixture) -> MockType: + return mocker.spy(auto_scaling_core, "_analyze_current_cluster") diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 753d25f9895..00dd5991a18 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -44,7 +44,6 @@ from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict from simcore_service_autoscaling.core.settings import ApplicationSettings from simcore_service_autoscaling.models import AssociatedInstance, Cluster -from simcore_service_autoscaling.modules import auto_scaling_core from simcore_service_autoscaling.modules.auto_scaling_core import ( _activate_drained_nodes, _find_terminateable_instances, @@ -237,11 +236,6 @@ def instance_type_filters( ] -@pytest.fixture -async def spied_cluster_analysis(mocker: MockerFixture) -> MockType: - return mocker.spy(auto_scaling_core, "_analyze_current_cluster") - - @dataclass(frozen=True) class _ScaleUpParams: imposed_instance_type: InstanceTypeType | None From 31308cb2168219f110dbacf149db043e51e99f4a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:56:02 +0100 Subject: [PATCH 28/43] preparing test --- services/autoscaling/tests/unit/conftest.py | 2 +- ...test_modules_auto_scaling_computational.py | 116 ++++++++++++++++-- .../unit/test_modules_auto_scaling_dynamic.py | 6 +- 3 files changed, 113 insertions(+), 11 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 1d676b404f2..416b1d5ba5a 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -892,7 +892,7 @@ async def fake_tag_node( @pytest.fixture -def patch_ec2_client_launch_instancess_min_number_of_instances( +def patch_ec2_client_launch_instances_min_number_of_instances( mocker: MockerFixture, ) -> mock.Mock: """the moto library always returns min number of instances instead of max number of instances which makes diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 8115a8e4363..7d5e050e052 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -11,10 +11,10 @@ import datetime import logging from collections import defaultdict -from collections.abc import Callable, Iterator +from collections.abc import Awaitable, Callable, Iterator from copy import deepcopy from dataclasses import dataclass -from typing import Any, Awaitable, cast +from typing import Any, cast from unittest import mock import arrow @@ -32,11 +32,11 @@ from models_library.generated_models.docker_rest_api import NodeState, NodeStatus from models_library.rabbitmq_messages import RabbitAutoscalingStatusMessage from pydantic import ByteSize, TypeAdapter -from pytest_mock import MockerFixture +from pytest_mock import MockerFixture, MockType from pytest_simcore.helpers.aws_ec2 import assert_autoscaled_computational_ec2_instances from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict from simcore_service_autoscaling.core.settings import ApplicationSettings -from simcore_service_autoscaling.models import EC2InstanceData +from simcore_service_autoscaling.models import Cluster, EC2InstanceData from simcore_service_autoscaling.modules.auto_scaling_core import auto_scale_cluster from simcore_service_autoscaling.modules.auto_scaling_mode_computational import ( ComputationalAutoscaling, @@ -266,6 +266,19 @@ class _ScaleUpParams: expected_num_instances: int +def _assert_cluster_state( + spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int +) -> None: + assert spied_cluster_analysis.call_count > 0 + + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert ( + spied_cluster_analysis.spy_return.total_number_of_machines() + == expected_num_machines + ) + print("current cluster state:", spied_cluster_analysis.spy_return) + + def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResources: return { res_key.upper(): res_value @@ -889,7 +902,7 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso ], ) async def test_cluster_scaling_up_starts_multiple_instances( - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, @@ -957,7 +970,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( ], ) async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and_not_more( - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, @@ -1038,7 +1051,7 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_starts_max_instances_and_not_more( - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, @@ -1295,3 +1308,92 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( assert ( all_instances["Reservations"][1]["Instances"][0]["State"]["Name"] == "running" ) + + +@pytest.mark.parametrize( + "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True +) +@pytest.mark.parametrize( + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) +@pytest.mark.parametrize( + "scale_up_params1, scale_up_params2", + [ + pytest.param( + _ScaleUpParams( + imposed_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB + task_resources=Resources( + cpus=16, ram=TypeAdapter(ByteSize).validate_python("30Gib") + ), + num_tasks=12, + expected_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB + expected_num_instances=10, + ), + _ScaleUpParams( + imposed_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB + task_resources=Resources( + cpus=32, ram=TypeAdapter(ByteSize).validate_python("20480MB") + ), + num_tasks=7, + expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB + expected_num_instances=7, + ), + id="A batch of services requiring g3.4xlarge and a batch requiring g4dn.8xlarge", + ), + ], +) +async def test_cluster_adapts_machines_on_the_fly( + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, + minimal_configuration: None, + ec2_client: EC2Client, + initialized_app: FastAPI, + app_settings: ApplicationSettings, + create_tasks_batch: Callable[[_ScaleUpParams], Awaitable[list[distributed.Future]]], + ec2_instance_custom_tags: dict[str, str], + scale_up_params1: _ScaleUpParams, + scale_up_params2: _ScaleUpParams, + mock_docker_tag_node: mock.Mock, + spied_cluster_analysis: MockType, + mocker: MockerFixture, +): + # pre-requisites + assert app_settings.AUTOSCALING_EC2_INSTANCES + assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES > 0 + assert ( + scale_up_params1.num_tasks + >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ), "this test requires to run a first batch of more services than the maximum number of instances allowed" + # we have nothing running now + all_instances = await ec2_client.describe_instances() + assert not all_instances["Reservations"] + + # + # 1. create the first batch of services requiring the initial machines + first_batch_tasks = await create_tasks_batch(scale_up_params1) + assert first_batch_tasks + + # it will only scale once and do nothing else + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + await assert_autoscaled_computational_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + ) + + _assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=0, + ) + # mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) + # mocked_associate_ec2_instances_with_nodes.reset_mock() + # mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + # create_fake_node, None, None + # ) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 00dd5991a18..8a1d6f2300f 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -373,7 +373,7 @@ async def test_cluster_scaling_with_no_services_does_nothing( async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expected_machines( - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, mock_machines_buffer: int, app_settings: ApplicationSettings, @@ -1165,7 +1165,7 @@ async def test_cluster_scaling_up_and_down_against_aws( ], ) async def test_cluster_scaling_up_starts_multiple_instances( - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, @@ -1257,7 +1257,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( ], ) async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 - patch_ec2_client_launch_instancess_min_number_of_instances: mock.Mock, + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, ec2_client: EC2Client, initialized_app: FastAPI, From cc6bb77e59f7703b1f8b1c07d283453050b6a698 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:01:07 +0100 Subject: [PATCH 29/43] fixed usage of resources --- .../tests/unit/test_modules_auto_scaling_computational.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 7d5e050e052..e8f0207a58c 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -14,7 +14,7 @@ from collections.abc import Awaitable, Callable, Iterator from copy import deepcopy from dataclasses import dataclass -from typing import Any, cast +from typing import Any, Final, cast from unittest import mock import arrow @@ -279,9 +279,12 @@ def _assert_cluster_state( print("current cluster state:", spied_cluster_analysis.spy_return) +_RESOURCE_TO_DASK_RESOURCE_MAP: Final[dict[str, str]] = {"CPUS": "CPU", "RAM": "RAM"} + + def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResources: return { - res_key.upper(): res_value + _RESOURCE_TO_DASK_RESOURCE_MAP[res_key.upper()]: res_value for res_key, res_value in resources.model_dump().items() } From 45eb79a55bd15a00910f7e8710e39771696ee2f8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:26:06 +0100 Subject: [PATCH 30/43] move fixtures down --- .../src/pytest_simcore/helpers/autoscaling.py | 70 +++++++++++++++ services/autoscaling/tests/unit/conftest.py | 14 +++ .../unit/test_modules_auto_scaling_dynamic.py | 90 +++---------------- 3 files changed, 97 insertions(+), 77 deletions(-) create mode 100644 packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py new file mode 100644 index 00000000000..4ecc889a96c --- /dev/null +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py @@ -0,0 +1,70 @@ +from collections.abc import Callable + +import arrow +from aws_library.ec2 import EC2InstanceData +from models_library.generated_models.docker_rest_api import ( + Availability, + Node, + NodeState, +) +from pytest_mock import MockType +from simcore_service_autoscaling.models import AssociatedInstance, Cluster +from simcore_service_autoscaling.utils.utils_docker import ( + _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY, + _OSPARC_SERVICE_READY_LABEL_KEY, + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY, +) + + +def assert_cluster_state( + spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int +) -> None: + assert spied_cluster_analysis.call_count > 0 + + assert isinstance(spied_cluster_analysis.spy_return, Cluster) + assert ( + spied_cluster_analysis.spy_return.total_number_of_machines() + == expected_num_machines + ) + print("current cluster state:", spied_cluster_analysis.spy_return) + + +def create_fake_association( + create_fake_node: Callable[..., Node], + drained_machine_id: str | None, + terminating_machine_id: str | None, +): + fake_node_to_instance_map = {} + + async def _fake_node_creator( + nodes: list[Node], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: + if instance not in fake_node_to_instance_map: + fake_node = create_fake_node() + assert fake_node.spec + fake_node.spec.availability = Availability.active + assert fake_node.status + fake_node.status.state = NodeState.ready + assert fake_node.spec.labels + fake_node.spec.labels |= { + _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), + _OSPARC_SERVICE_READY_LABEL_KEY: ( + "true" if instance.id != drained_machine_id else "false" + ), + } + if instance.id == terminating_machine_id: + fake_node.spec.labels |= { + _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY: arrow.utcnow().isoformat() + } + fake_node_to_instance_map[instance] = fake_node + return fake_node_to_instance_map[instance] + + associated_instances = [ + AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) + for i in ec2_instances + ] + + return associated_instances, [] + + return _fake_node_creator diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 416b1d5ba5a..4ec042ce65f 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -1009,3 +1009,17 @@ def with_short_ec2_instances_max_start_time( @pytest.fixture async def spied_cluster_analysis(mocker: MockerFixture) -> MockType: return mocker.spy(auto_scaling_core, "_analyze_current_cluster") + + +@pytest.fixture +async def mocked_associate_ec2_instances_with_nodes(mocker: MockerFixture) -> mock.Mock: + async def _( + nodes: list[DockerNode], ec2_instances: list[EC2InstanceData] + ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: + return [], ec2_instances + + return mocker.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.associate_ec2_instances_with_nodes", + autospec=True, + side_effect=_, + ) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 8a1d6f2300f..10414a2f0de 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -39,6 +39,10 @@ from pydantic import ByteSize, TypeAdapter from pytest_mock import MockType from pytest_mock.plugin import MockerFixture +from pytest_simcore.helpers.autoscaling import ( + assert_cluster_state, + create_fake_association, +) from pytest_simcore.helpers.aws_ec2 import assert_autoscaled_dynamic_ec2_instances from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict @@ -245,33 +249,6 @@ class _ScaleUpParams: expected_num_instances: int -def _assert_cluster_state( - spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int -) -> None: - assert spied_cluster_analysis.call_count > 0 - - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert ( - spied_cluster_analysis.spy_return.total_number_of_machines() - == expected_num_machines - ) - print("current cluster state:", spied_cluster_analysis.spy_return) - - -@pytest.fixture -async def mocked_associate_ec2_instances_with_nodes(mocker: MockerFixture) -> mock.Mock: - async def _( - nodes: list[Node], ec2_instances: list[EC2InstanceData] - ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: - return [], ec2_instances - - return mocker.patch( - "simcore_service_autoscaling.modules.auto_scaling_core.associate_ec2_instances_with_nodes", - autospec=True, - side_effect=_, - ) - - @pytest.fixture async def create_services_batch( create_service: Callable[ @@ -281,7 +258,7 @@ async def create_services_batch( create_task_reservations: Callable[[int, int], dict[str, Any]], service_monitored_labels: dict[DockerLabelKey, str], osparc_docker_label_keys: StandardSimcoreDockerLabels, -) -> Callable[[Resources, str | None, int], Awaitable[list[Service]]]: +) -> Callable[[Resources, InstanceTypeType | None, int], Awaitable[list[Service]]]: async def _( service_resources: Resources, imposed_instance_type: InstanceTypeType | None, @@ -313,47 +290,6 @@ async def _( return _ -def _create_fake_association( - create_fake_node: Callable, - drained_machine_id: str | None, - terminating_machine_id: str | None, -): - fake_node_to_instance_map = {} - - async def _fake_node_creator( - nodes: list[Node], ec2_instances: list[EC2InstanceData] - ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: - def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: - if instance not in fake_node_to_instance_map: - fake_node = create_fake_node() - assert fake_node.spec - fake_node.spec.availability = Availability.active - assert fake_node.status - fake_node.status.state = NodeState.ready - assert fake_node.spec.labels - fake_node.spec.labels |= { - _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY: arrow.utcnow().isoformat(), - _OSPARC_SERVICE_READY_LABEL_KEY: ( - "true" if instance.id != drained_machine_id else "false" - ), - } - if instance.id == terminating_machine_id: - fake_node.spec.labels |= { - _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY: arrow.utcnow().isoformat() - } - fake_node_to_instance_map[instance] = fake_node - return fake_node_to_instance_map[instance] - - associated_instances = [ - AssociatedInstance(node=_create_fake_node_with_labels(i), ec2_instance=i) - for i in ec2_instances - ] - - return associated_instances, [] - - return _fake_node_creator - - async def test_cluster_scaling_with_no_services_does_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -550,7 +486,7 @@ async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - _assert_cluster_state( + assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=0 ) @@ -599,7 +535,7 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - _assert_cluster_state( + assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=1 ) @@ -1309,14 +1245,14 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 expected_additional_tag_keys=list(ec2_instance_custom_tags), instance_filters=instance_type_filters, ) - _assert_cluster_state( + assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=0, ) mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) mocked_associate_ec2_instances_with_nodes.reset_mock() - mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( create_fake_node, None, None ) @@ -1325,7 +1261,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - _assert_cluster_state( + assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, @@ -1338,7 +1274,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 ) # - # 2. now we start the second batch of services requiring a different type of machines + # 3. now we start the second batch of services requiring a different type of machines await create_services_batch( scale_up_params2.service_resources, scale_up_params2.imposed_instance_type, @@ -1418,7 +1354,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 drained_machine_instance_id = spied_cluster_analysis.spy_return.active_nodes[ -1 ].ec2_instance.id - mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( create_fake_node, drained_machine_instance_id, None ) await auto_scale_cluster( @@ -1447,7 +1383,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 ) # and this should now recognize the node as terminating - mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( create_fake_node, drained_machine_instance_id, drained_machine_instance_id ) await auto_scale_cluster( From eb13e88874767b929d24076559731ff2bc36a152 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:32:37 +0100 Subject: [PATCH 31/43] ruff --- services/autoscaling/tests/unit/conftest.py | 47 +++++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 4ec042ce65f..4a48f2776b6 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -178,7 +178,11 @@ def app_with_docker_join_drained( @pytest.fixture(scope="session") def fake_ssm_settings() -> SSMSettings: assert "json_schema_extra" in SSMSettings.model_config - return SSMSettings(**SSMSettings.model_config["json_schema_extra"]["examples"][0]) + assert isinstance(SSMSettings.model_config["json_schema_extra"], dict) + assert isinstance(SSMSettings.model_config["json_schema_extra"]["examples"], list) + return SSMSettings.model_validate( + SSMSettings.model_config["json_schema_extra"]["examples"][0] + ) @pytest.fixture @@ -222,6 +226,11 @@ def app_environment( delenvs_from_dict(monkeypatch, mock_env_devel_environment, raising=False) return setenvs_from_dict(monkeypatch, {**external_envfile_dict}) + assert "json_schema_extra" in EC2InstanceBootSpecific.model_config + assert isinstance(EC2InstanceBootSpecific.model_config["json_schema_extra"], dict) + assert isinstance( + EC2InstanceBootSpecific.model_config["json_schema_extra"]["examples"], list + ) envs = setenvs_from_dict( monkeypatch, { @@ -265,6 +274,11 @@ def mocked_ec2_instances_envs( aws_allowed_ec2_instance_type_names: list[InstanceTypeType], aws_instance_profile: str, ) -> EnvVarsDict: + assert "json_schema_extra" in EC2InstanceBootSpecific.model_config + assert isinstance(EC2InstanceBootSpecific.model_config["json_schema_extra"], dict) + assert isinstance( + EC2InstanceBootSpecific.model_config["json_schema_extra"]["examples"], list + ) envs = setenvs_from_dict( monkeypatch, { @@ -273,10 +287,13 @@ def mocked_ec2_instances_envs( "EC2_INSTANCES_SUBNET_ID": aws_subnet_id, "EC2_INSTANCES_ALLOWED_TYPES": json.dumps( { - ec2_type_name: random.choice( # noqa: S311 - EC2InstanceBootSpecific.model_config["json_schema_extra"][ - "examples" - ] + ec2_type_name: cast( + dict, + random.choice( # noqa: S311 + EC2InstanceBootSpecific.model_config["json_schema_extra"][ + "examples" + ] + ), ) | {"ami_id": aws_ami_id} for ec2_type_name in aws_allowed_ec2_instance_type_names @@ -493,22 +510,23 @@ def create_fake_node(faker: Faker) -> Callable[..., DockerNode]: def _creator(**node_overrides) -> DockerNode: default_config = { "ID": faker.uuid4(), - "Version": ObjectVersion(Index=faker.pyint()), + "Version": ObjectVersion(index=faker.pyint()), "CreatedAt": datetime.datetime.now(tz=datetime.UTC).isoformat(), "UpdatedAt": datetime.datetime.now(tz=datetime.UTC).isoformat(), "Description": NodeDescription( - Hostname=faker.pystr(), - Resources=ResourceObject( - NanoCPUs=int(9 * 1e9), MemoryBytes=256 * 1024 * 1024 * 1024 + hostname=faker.pystr(), + resources=ResourceObject( + nano_cp_us=int(9 * 1e9), + memory_bytes=TypeAdapter(ByteSize).validate_python("256GiB"), ), ), "Spec": NodeSpec( - Name=None, - Labels=faker.pydict(allowed_types=(str,)), - Role=None, - Availability=Availability.drain, + name=None, + labels=faker.pydict(allowed_types=(str,)), + role=None, + availability=Availability.drain, ), - "Status": NodeStatus(State=NodeState.unknown, Message=None, Addr=None), + "Status": NodeStatus(state=NodeState.unknown, message=None, addr=None), } default_config.update(**node_overrides) return DockerNode(**default_config) @@ -706,6 +724,7 @@ async def _assert_wait_for_service_state( after=after_log(ctx.logger, logging.DEBUG), ) async def _() -> None: + assert service.id services = await async_docker_client.services.list( filters={"id": service.id} ) From a9353d3833c42a7032284462f5b24c96057cb035 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:01:32 +0100 Subject: [PATCH 32/43] ongoing --- ...test_modules_auto_scaling_computational.py | 81 ++++++++++++++----- .../unit/test_modules_auto_scaling_dynamic.py | 13 ++- 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index e8f0207a58c..03824ec1e0a 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -33,10 +33,14 @@ from models_library.rabbitmq_messages import RabbitAutoscalingStatusMessage from pydantic import ByteSize, TypeAdapter from pytest_mock import MockerFixture, MockType +from pytest_simcore.helpers.autoscaling import ( + assert_cluster_state, + create_fake_association, +) from pytest_simcore.helpers.aws_ec2 import assert_autoscaled_computational_ec2_instances from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict from simcore_service_autoscaling.core.settings import ApplicationSettings -from simcore_service_autoscaling.models import Cluster, EC2InstanceData +from simcore_service_autoscaling.models import EC2InstanceData from simcore_service_autoscaling.modules.auto_scaling_core import auto_scale_cluster from simcore_service_autoscaling.modules.auto_scaling_mode_computational import ( ComputationalAutoscaling, @@ -266,19 +270,6 @@ class _ScaleUpParams: expected_num_instances: int -def _assert_cluster_state( - spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int -) -> None: - assert spied_cluster_analysis.call_count > 0 - - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert ( - spied_cluster_analysis.spy_return.total_number_of_machines() - == expected_num_machines - ) - print("current cluster state:", spied_cluster_analysis.spy_return) - - _RESOURCE_TO_DASK_RESOURCE_MAP: Final[dict[str, str]] = {"CPUS": "CPU", "RAM": "RAM"} @@ -1357,6 +1348,10 @@ async def test_cluster_adapts_machines_on_the_fly( ec2_instance_custom_tags: dict[str, str], scale_up_params1: _ScaleUpParams, scale_up_params2: _ScaleUpParams, + mocked_associate_ec2_instances_with_nodes: mock.Mock, + mock_docker_set_node_availability: mock.Mock, + mock_dask_is_worker_connected: mock.Mock, + create_fake_node: Callable[..., DockerNode], mock_docker_tag_node: mock.Mock, spied_cluster_analysis: MockType, mocker: MockerFixture, @@ -1390,13 +1385,59 @@ async def test_cluster_adapts_machines_on_the_fly( expected_additional_tag_keys=list(ec2_instance_custom_tags), ) - _assert_cluster_state( + assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=0, ) - # mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) - # mocked_associate_ec2_instances_with_nodes.reset_mock() - # mocked_associate_ec2_instances_with_nodes.side_effect = _create_fake_association( - # create_fake_node, None, None - # ) + mocked_associate_ec2_instances_with_nodes.assert_called_once_with([], []) + mocked_associate_ec2_instances_with_nodes.reset_mock() + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( + create_fake_node, None, None + ) + mock_docker_tag_node.assert_not_called() + mock_dask_is_worker_connected.assert_not_called() + + # + # 2. now the machines are associated + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + mocked_associate_ec2_instances_with_nodes.assert_called_once() + mock_docker_tag_node.assert_called() + assert ( + mock_docker_tag_node.call_count + == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ) + assert analyzed_cluster.active_nodes + + # + # 3. now we start the second batch of services requiring a different type of machines + second_batch_tasks = await create_tasks_batch(scale_up_params2) + assert second_batch_tasks + + # scaling will do nothing since we have hit the maximum number of machines + for _ in range(3): + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + await assert_autoscaled_computational_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=scale_up_params1.expected_num_instances, + expected_instance_type=scale_up_params1.expected_instance_type, + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 10414a2f0de..16383ba3d57 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1261,7 +1261,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - assert_cluster_state( + analyzed_cluster = assert_cluster_state( spied_cluster_analysis, expected_calls=1, expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, @@ -1272,6 +1272,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 mock_docker_tag_node.call_count == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES ) + assert analyzed_cluster.active_nodes # # 3. now we start the second batch of services requiring a different type of machines @@ -1296,9 +1297,13 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 instance_filters=instance_type_filters, ) - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert spied_cluster_analysis.spy_return.active_nodes - assert not spied_cluster_analysis.spy_return.drained_nodes + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes # now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max # a machine should switch off and another type should be started From 016531e11746e268bd526c43e122826356cf3d2a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:01:48 +0100 Subject: [PATCH 33/43] improve --- .../pytest-simcore/src/pytest_simcore/helpers/autoscaling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py index 4ecc889a96c..d21a436ace6 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py @@ -18,7 +18,7 @@ def assert_cluster_state( spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int -) -> None: +) -> Cluster: assert spied_cluster_analysis.call_count > 0 assert isinstance(spied_cluster_analysis.spy_return, Cluster) @@ -27,6 +27,7 @@ def assert_cluster_state( == expected_num_machines ) print("current cluster state:", spied_cluster_analysis.spy_return) + return spied_cluster_analysis.spy_return def create_fake_association( From 9361517836b6e0b879ed963eaa5d976dc84411f4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:10:35 +0100 Subject: [PATCH 34/43] better log --- packages/aws-library/src/aws_library/ec2/_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/aws-library/src/aws_library/ec2/_client.py b/packages/aws-library/src/aws_library/ec2/_client.py index a40cf794304..276423415a5 100644 --- a/packages/aws-library/src/aws_library/ec2/_client.py +++ b/packages/aws-library/src/aws_library/ec2/_client.py @@ -181,7 +181,8 @@ async def launch_instances( ) instance_ids = [i["InstanceId"] for i in instances["Instances"]] _logger.info( - "New instances launched: %s, waiting for them to start now...", + "%s New instances launched: %s, waiting for them to start now...", + len(instance_ids), instance_ids, ) From 0c4a1e4331dc779e9800851f03e4cd559f8e434d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:10:41 +0100 Subject: [PATCH 35/43] linter --- .../pytest-simcore/src/pytest_simcore/helpers/autoscaling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py index d21a436ace6..b0aca54e088 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py @@ -17,7 +17,7 @@ def assert_cluster_state( - spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int + spied_cluster_analysis: MockType, *, expected_num_machines: int ) -> Cluster: assert spied_cluster_analysis.call_count > 0 @@ -38,7 +38,7 @@ def create_fake_association( fake_node_to_instance_map = {} async def _fake_node_creator( - nodes: list[Node], ec2_instances: list[EC2InstanceData] + _nodes: list[Node], ec2_instances: list[EC2InstanceData] ) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]: def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node: if instance not in fake_node_to_instance_map: From d8a77bd824ebe07c7846efe15072299efad5b491 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:11:32 +0100 Subject: [PATCH 36/43] linter --- .../pytest-simcore/src/pytest_simcore/helpers/autoscaling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py index b0aca54e088..c746d8fd8c7 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py @@ -17,9 +17,9 @@ def assert_cluster_state( - spied_cluster_analysis: MockType, *, expected_num_machines: int + spied_cluster_analysis: MockType, *, expected_calls: int, expected_num_machines: int ) -> Cluster: - assert spied_cluster_analysis.call_count > 0 + assert spied_cluster_analysis.call_count == expected_calls assert isinstance(spied_cluster_analysis.spy_return, Cluster) assert ( From f1abbfb22fcc6f98feae489be4b965a8b2608dec Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:11:44 +0100 Subject: [PATCH 37/43] better logging --- .../modules/auto_scaling_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 472b15bafd2..f430ba508bc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -538,7 +538,8 @@ async def _assign_tasks_to_current_cluster( if unassigned_tasks: _logger.info( - "the current cluster should cope with %s tasks, %s are unnassigned/queued tasks and will need new EC2s", + "the current cluster should cope with %s tasks, %s are unnassigned/queued " + "tasks and need to wait or get new EC2s", len(tasks) - len(unassigned_tasks), len(unassigned_tasks), ) @@ -614,7 +615,8 @@ async def _find_needed_instances( _logger.exception("Unexpected error:") _logger.info( - "found following needed instances: %s", + "found following needed %s instances: %s", + len(needed_new_instance_types_for_tasks), [ f"{i.instance_type.name=}:{i.instance_type.resources} with {len(i.assigned_tasks)} tasks" for i in needed_new_instance_types_for_tasks From 1cac434b95e4f15853e0024ee6f0974f653ed492 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:11:58 +0100 Subject: [PATCH 38/43] cleanup --- .../unit/test_modules_auto_scaling_dynamic.py | 78 +++++++++++-------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 16383ba3d57..eeab2382df4 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1305,7 +1305,8 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 assert analyzed_cluster.active_nodes assert not analyzed_cluster.drained_nodes - # now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max + # + # 4.now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max # a machine should switch off and another type should be started completed_services_to_stop = random.sample( first_batch_services, @@ -1329,16 +1330,19 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert spied_cluster_analysis.spy_return.active_nodes - assert not spied_cluster_analysis.spy_return.drained_nodes - - # the last machine is found empty - mock_docker_set_node_found_empty.assert_called_with( - mock.ANY, - spied_cluster_analysis.spy_return.active_nodes[-1].node, - empty=True, - ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + # the last machine is found empty + mock_docker_set_node_found_empty.assert_called_with( + mock.ANY, + analyzed_cluster.active_nodes[-1].node, + empty=True, + ) # now we mock the get_node_found_empty so the next call will actually drain the machine with mock.patch( @@ -1351,22 +1355,29 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - mocked_get_node_empty_since.assert_called_once() - assert isinstance(spied_cluster_analysis.spy_return, Cluster) - assert spied_cluster_analysis.spy_return.active_nodes - assert not spied_cluster_analysis.spy_return.drained_nodes - # now the drained machine - drained_machine_instance_id = spied_cluster_analysis.spy_return.active_nodes[ - -1 - ].ec2_instance.id + mocked_get_node_empty_since.assert_called_once() + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + # now scaling again should find the drained machine + drained_machine_instance_id = analyzed_cluster.active_nodes[-1].ec2_instance.id mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( create_fake_node, drained_machine_instance_id, None ) await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - assert spied_cluster_analysis.spy_return.active_nodes - assert spied_cluster_analysis.spy_return.drained_nodes + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert analyzed_cluster.drained_nodes # this will initiate termination now with mock.patch( @@ -1380,23 +1391,28 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - mock_docker_tag_node.assert_called_with( - mock.ANY, - spied_cluster_analysis.spy_return.drained_nodes[-1].node, - tags=mock.ANY, - available=False, - ) + mock_docker_tag_node.assert_called_with( + mock.ANY, + analyzed_cluster.drained_nodes[-1].node, + tags=mock.ANY, + available=False, + ) - # and this should now recognize the node as terminating + # scaling again should find the terminating machine mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( create_fake_node, drained_machine_instance_id, drained_machine_instance_id ) await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) - assert spied_cluster_analysis.spy_return.active_nodes - assert not spied_cluster_analysis.spy_return.drained_nodes - assert spied_cluster_analysis.spy_return.terminating_nodes + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + assert analyzed_cluster.terminating_nodes # now this will terminate it and straight away start a new machine type with mock.patch( From b72085dbf85b51d473d91ebb021a1ba884f7cc4f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:12:08 +0100 Subject: [PATCH 39/43] test completed --- ...test_modules_auto_scaling_computational.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 03824ec1e0a..63ca3190f0a 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -1441,3 +1441,151 @@ async def test_cluster_adapts_machines_on_the_fly( ) assert analyzed_cluster.active_nodes assert not analyzed_cluster.drained_nodes + + # + # 4.now we simulate that some of the services in the 1st batch have completed and that we are 1 below the max + # a machine should switch off and another type should be started (just pop the future out of scope) + for _ in range( + scale_up_params1.num_tasks + - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + + 1 + ): + first_batch_tasks.pop() + + # first call to auto_scale_cluster will mark 1 node as empty + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.set_node_found_empty", + autospec=True, + ) as mock_docker_set_node_found_empty: + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + # the last machine is found empty + mock_docker_set_node_found_empty.assert_called_with( + mock.ANY, + analyzed_cluster.active_nodes[-1].node, + empty=True, + ) + + # now we mock the get_node_found_empty so the next call will actually drain the machine + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_empty_since", + autospec=True, + return_value=arrow.utcnow().datetime + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_DRAINING, + ) as mocked_get_node_empty_since: + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + mocked_get_node_empty_since.assert_called_once() + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + # now scaling again should find the drained machine + drained_machine_instance_id = analyzed_cluster.active_nodes[-1].ec2_instance.id + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( + create_fake_node, drained_machine_instance_id, None + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert analyzed_cluster.drained_nodes + + # this will initiate termination now + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_last_readyness_update", + autospec=True, + return_value=arrow.utcnow().datetime + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION, + ): + mock_docker_tag_node.reset_mock() + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + mock_docker_tag_node.assert_called_with( + mock.ANY, + analyzed_cluster.drained_nodes[-1].node, + tags=mock.ANY, + available=False, + ) + + # scaling again should find the terminating machine + mocked_associate_ec2_instances_with_nodes.side_effect = create_fake_association( + create_fake_node, drained_machine_instance_id, drained_machine_instance_id + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) + assert analyzed_cluster.active_nodes + assert not analyzed_cluster.drained_nodes + assert analyzed_cluster.terminating_nodes + + # now this will terminate it and straight away start a new machine type + with mock.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.get_node_termination_started_since", + autospec=True, + return_value=arrow.utcnow().datetime + - 1.5 + * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION, + ): + mocked_docker_remove_node = mocker.patch( + "simcore_service_autoscaling.modules.auto_scaling_core.utils_docker.remove_nodes", + return_value=None, + autospec=True, + ) + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() + ) + mocked_docker_remove_node.assert_called_once() + + # now let's check what we have + all_instances = await ec2_client.describe_instances() + assert len(all_instances["Reservations"]) == 2, "there should be 2 Reservations" + reservation1 = all_instances["Reservations"][0] + assert "Instances" in reservation1 + assert len(reservation1["Instances"]) == ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}" + for instance in reservation1["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params1.expected_instance_type + assert "InstanceId" in instance + assert "State" in instance + assert "Name" in instance["State"] + if instance["InstanceId"] == drained_machine_instance_id: + assert instance["State"]["Name"] == "terminated" + else: + assert instance["State"]["Name"] == "running" + + reservation2 = all_instances["Reservations"][1] + assert "Instances" in reservation2 + assert ( + len(reservation2["Instances"]) == 1 + ), f"expected 1 EC2 instances, found {len(reservation2['Instances'])}" + for instance in reservation2["Instances"]: + assert "InstanceType" in instance + assert instance["InstanceType"] == scale_up_params2.expected_instance_type From 2f8bfef62533ec4fcf6f28f9b047c38cba39009b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:16:09 +0100 Subject: [PATCH 40/43] cleanup --- .../src/pytest_simcore/helpers/autoscaling.py | 4 +++- .../tests/unit/test_modules_auto_scaling_computational.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py index c746d8fd8c7..2d6c278d92c 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/autoscaling.py @@ -27,7 +27,9 @@ def assert_cluster_state( == expected_num_machines ) print("current cluster state:", spied_cluster_analysis.spy_return) - return spied_cluster_analysis.spy_return + cluster = spied_cluster_analysis.spy_return + spied_cluster_analysis.reset_mock() + return cluster def create_fake_association( diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 63ca3190f0a..6e7a0d7c828 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -1436,7 +1436,7 @@ async def test_cluster_adapts_machines_on_the_fly( ) analyzed_cluster = assert_cluster_state( spied_cluster_analysis, - expected_calls=1, + expected_calls=3, expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, ) assert analyzed_cluster.active_nodes @@ -1521,6 +1521,11 @@ async def test_cluster_adapts_machines_on_the_fly( await auto_scale_cluster( app=initialized_app, auto_scaling_mode=ComputationalAutoscaling() ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) mock_docker_tag_node.assert_called_with( mock.ANY, analyzed_cluster.drained_nodes[-1].node, From f84796b9514bef04771a9da34dbad92b4216ee6f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:22:57 +0100 Subject: [PATCH 41/43] cleanup --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index eeab2382df4..22312a71ff6 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1299,7 +1299,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 analyzed_cluster = assert_cluster_state( spied_cluster_analysis, - expected_calls=1, + expected_calls=3, expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, ) assert analyzed_cluster.active_nodes @@ -1391,6 +1391,11 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + ) mock_docker_tag_node.assert_called_with( mock.ANY, analyzed_cluster.drained_nodes[-1].node, From 0de1beddbc17e401f0402e249ab743e780722f3d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:39:11 +0100 Subject: [PATCH 42/43] cleanup --- .../unit/test_modules_auto_scaling_dynamic.py | 158 +++++++----------- 1 file changed, 64 insertions(+), 94 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 22312a71ff6..4aa6c302fca 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -258,32 +258,28 @@ async def create_services_batch( create_task_reservations: Callable[[int, int], dict[str, Any]], service_monitored_labels: dict[DockerLabelKey, str], osparc_docker_label_keys: StandardSimcoreDockerLabels, -) -> Callable[[Resources, InstanceTypeType | None, int], Awaitable[list[Service]]]: - async def _( - service_resources: Resources, - imposed_instance_type: InstanceTypeType | None, - num_services: int, - ) -> list[Service]: +) -> Callable[[_ScaleUpParams], Awaitable[list[Service]]]: + async def _(scale_up_params: _ScaleUpParams) -> list[Service]: return await asyncio.gather( *( create_service( task_template | create_task_reservations( - int(service_resources.cpus), - service_resources.ram, + int(scale_up_params.service_resources.cpus), + scale_up_params.service_resources.ram, ), service_monitored_labels | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), "pending", ( [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={imposed_instance_type}" + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params.imposed_instance_type}" ] - if imposed_instance_type + if scale_up_params.imposed_instance_type else [] ), ) - for _ in range(num_services) + for _ in range(scale_up_params.num_services) ) ) @@ -411,28 +407,34 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect ) +@pytest.mark.parametrize( + "scale_up_params", + [ + pytest.param( + _ScaleUpParams( + imposed_instance_type=None, + service_resources=Resources( + cpus=4, ram=TypeAdapter(ByteSize).validate_python("128000Gib") + ), + num_services=1, + expected_instance_type="r5n.4xlarge", + expected_num_instances=1, + ), + id="No explicit instance defined", + ), + ], +) async def test_cluster_scaling_with_service_asking_for_too_much_resources_starts_nothing( minimal_configuration: None, - service_monitored_labels: dict[DockerLabelKey, str], app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str], Awaitable[Service] - ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], mock_launch_instances: mock.Mock, mock_terminate_instances: mock.Mock, mock_rabbitmq_post_message: mock.Mock, + scale_up_params: _ScaleUpParams, ): - task_template_with_too_many_resource = task_template | create_task_reservations( - 1000, 0 - ) - await create_service( - task_template_with_too_many_resource, - service_monitored_labels, - "pending", - ) + await create_services_batch(scale_up_params) await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() @@ -448,9 +450,7 @@ async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 *, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_services_batch: Callable[ - [Resources, str | None, int], Awaitable[list[Service]] - ], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -476,11 +476,7 @@ async def _test_cluster_scaling_up_and_down( # noqa: PLR0915 ), "This test is not made to work with more than 1 expected instance. so please adapt if needed" # create the service(s) - created_docker_services = await create_services_batch( - scale_up_params.service_resources, - scale_up_params.imposed_instance_type, - scale_up_params.num_services, - ) + created_docker_services = await create_services_batch(scale_up_params) # this should trigger a scaling up as we have no nodes await auto_scale_cluster( @@ -953,9 +949,7 @@ async def test_cluster_scaling_up_and_down( minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_services_batch: Callable[ - [Resources, str | None, int], Awaitable[list[Service]] - ], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -1023,9 +1017,7 @@ async def test_cluster_scaling_up_and_down_against_aws( external_envfile_dict: EnvVarsDict, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_services_batch: Callable[ - [Resources, str | None, int], Awaitable[list[Service]] - ], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, fake_node: Node, @@ -1105,9 +1097,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( minimal_configuration: None, app_settings: ApplicationSettings, initialized_app: FastAPI, - create_services_batch: Callable[ - [Resources, str | None, int], Awaitable[list[Service]] - ], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_client: EC2Client, mock_docker_tag_node: mock.Mock, scale_up_params: _ScaleUpParams, @@ -1122,11 +1112,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( assert not all_instances["Reservations"] # create several tasks that needs more power - await create_services_batch( - scale_up_params.service_resources, - scale_up_params.imposed_instance_type, - scale_up_params.num_services, - ) + await create_services_batch(scale_up_params) # run the code await auto_scale_cluster( @@ -1198,9 +1184,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 ec2_client: EC2Client, initialized_app: FastAPI, app_settings: ApplicationSettings, - create_services_batch: Callable[ - [Resources, str | None, int], Awaitable[list[Service]] - ], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_instance_custom_tags: dict[str, str], instance_type_filters: Sequence[FilterTypeDef], async_docker_client: aiodocker.Docker, @@ -1226,11 +1210,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 # # 1. create the first batch of services requiring the initial machines - first_batch_services = await create_services_batch( - scale_up_params1.service_resources, - scale_up_params1.imposed_instance_type, - scale_up_params1.num_services, - ) + first_batch_services = await create_services_batch(scale_up_params1) # it will only scale once and do nothing else await auto_scale_cluster( @@ -1276,11 +1256,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 # # 3. now we start the second batch of services requiring a different type of machines - await create_services_batch( - scale_up_params2.service_resources, - scale_up_params2.imposed_instance_type, - scale_up_params2.num_services, - ) + await create_services_batch(scale_up_params2) # scaling will do nothing since we have hit the maximum number of machines for _ in range(3): @@ -1467,12 +1443,18 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 @pytest.mark.parametrize( - "docker_service_imposed_ec2_type, docker_service_ram, expected_ec2_type", + "scale_up_params", [ pytest.param( - None, - TypeAdapter(ByteSize).validate_python("128Gib"), - "r5n.4xlarge", + _ScaleUpParams( + imposed_instance_type=None, + service_resources=Resources( + cpus=4, ram=TypeAdapter(ByteSize).validate_python("128Gib") + ), + num_services=1, + expected_instance_type="r5n.4xlarge", + expected_num_instances=1, + ), id="No explicit instance defined", ), ], @@ -1480,24 +1462,17 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( with_short_ec2_instances_max_start_time: EnvVarsDict, minimal_configuration: None, - service_monitored_labels: dict[DockerLabelKey, str], app_settings: ApplicationSettings, initialized_app: FastAPI, - create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] - ], - task_template: dict[str, Any], - create_task_reservations: Callable[[int, int], dict[str, Any]], + create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]], ec2_client: EC2Client, - docker_service_imposed_ec2_type: InstanceTypeType | None, - docker_service_ram: ByteSize, - expected_ec2_type: InstanceTypeType, mock_find_node_with_name_returns_none: mock.Mock, mock_docker_tag_node: mock.Mock, mock_rabbitmq_post_message: mock.Mock, short_ec2_instance_max_start_time: datetime.timedelta, ec2_instance_custom_tags: dict[str, str], instance_type_filters: Sequence[FilterTypeDef], + scale_up_params: _ScaleUpParams, ): assert app_settings.AUTOSCALING_EC2_INSTANCES assert ( @@ -1507,19 +1482,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( # we have nothing running now all_instances = await ec2_client.describe_instances() assert not all_instances["Reservations"] - # create a service - await create_service( - task_template | create_task_reservations(4, docker_service_ram), - service_monitored_labels, - "pending", - ( - [ - f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={ docker_service_imposed_ec2_type}" - ] - if docker_service_imposed_ec2_type - else [] - ), - ) + await create_services_batch(scale_up_params) + # this should trigger a scaling up as we have no nodes await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() @@ -1529,8 +1493,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( instances = await assert_autoscaled_dynamic_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), instance_filters=instance_type_filters, @@ -1545,7 +1509,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( app_settings, initialized_app, instances_running=0, - instances_pending=1, + instances_pending=scale_up_params.expected_num_instances, ) mock_rabbitmq_post_message.reset_mock() @@ -1572,8 +1536,8 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( instances = await assert_autoscaled_dynamic_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=1, - expected_instance_type=expected_ec2_type, + expected_num_instances=scale_up_params.expected_num_instances, + expected_instance_type=scale_up_params.expected_instance_type, expected_instance_state="running", expected_additional_tag_keys=list(ec2_instance_custom_tags), instance_filters=instance_type_filters, @@ -1585,7 +1549,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( app_settings, initialized_app, instances_running=0, - instances_pending=1, + instances_pending=scale_up_params.expected_num_instances, ) mock_rabbitmq_post_message.reset_mock() assert instances @@ -1616,7 +1580,10 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( all_instances = await ec2_client.describe_instances() assert len(all_instances["Reservations"]) == 2 assert "Instances" in all_instances["Reservations"][0] - assert len(all_instances["Reservations"][0]["Instances"]) == 1 + assert ( + len(all_instances["Reservations"][0]["Instances"]) + == scale_up_params.expected_num_instances + ) assert "State" in all_instances["Reservations"][0]["Instances"][0] assert "Name" in all_instances["Reservations"][0]["Instances"][0]["State"] assert ( @@ -1625,7 +1592,10 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( ) assert "Instances" in all_instances["Reservations"][1] - assert len(all_instances["Reservations"][1]["Instances"]) == 1 + assert ( + len(all_instances["Reservations"][1]["Instances"]) + == scale_up_params.expected_num_instances + ) assert "State" in all_instances["Reservations"][1]["Instances"][0] assert "Name" in all_instances["Reservations"][1]["Instances"][0]["State"] assert ( From 3a40819c26edf12db2d26f2aae8fd0f3bd3a828a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:04:51 +0100 Subject: [PATCH 43/43] improve logging --- .../modules/auto_scaling_core.py | 210 ++++++++++-------- 1 file changed, 113 insertions(+), 97 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index f430ba508bc..f86c555a253 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -386,13 +386,18 @@ async def _activate_drained_nodes( if node.assigned_tasks ] - # activate these nodes now - await asyncio.gather( - *( - _activate_and_notify(app, auto_scaling_mode, node) - for node in nodes_to_activate + if not nodes_to_activate: + return cluster + + with log_context( + _logger, logging.INFO, f"activate {len(nodes_to_activate)} drained nodes" + ): + await asyncio.gather( + *( + _activate_and_notify(app, auto_scaling_mode, node) + for node in nodes_to_activate + ) ) - ) new_active_node_ids = {node.ec2_instance.id for node in nodes_to_activate} remaining_drained_nodes = [ node @@ -421,12 +426,17 @@ async def _start_buffer_instances( if not instances_to_start: return cluster # change the buffer machine to an active one - await get_ec2_client(app).set_instances_tags( - instances_to_start, - tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode), - ) + with log_context( + _logger, logging.INFO, f"start {len(instances_to_start)} buffer machines" + ): + await get_ec2_client(app).set_instances_tags( + instances_to_start, + tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode), + ) - started_instances = await get_ec2_client(app).start_instances(instances_to_start) + started_instances = await get_ec2_client(app).start_instances( + instances_to_start + ) started_instance_ids = [i.id for i in started_instances] return dataclasses.replace( @@ -615,10 +625,10 @@ async def _find_needed_instances( _logger.exception("Unexpected error:") _logger.info( - "found following needed %s instances: %s", + "found following %s needed instances: %s", len(needed_new_instance_types_for_tasks), [ - f"{i.instance_type.name=}:{i.instance_type.resources} with {len(i.assigned_tasks)} tasks" + f"{i.instance_type.name}:{i.instance_type.resources} takes {len(i.assigned_tasks)} task{'s' if len(i.assigned_tasks)>1 else ''}" for i in needed_new_instance_types_for_tasks ], ) @@ -810,39 +820,6 @@ async def _launch_instances( return new_pending_instances -async def _scale_up_cluster( - app: FastAPI, - cluster: Cluster, - unassigned_tasks: list, - auto_scaling_mode: BaseAutoscaling, - allowed_instance_types: list[EC2InstanceType], -) -> Cluster: - app_settings: ApplicationSettings = app.state.settings - assert app_settings.AUTOSCALING_EC2_ACCESS # nosec - assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - - # let's start these - if needed_ec2_instances := await _find_needed_instances( - app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode - ): - await auto_scaling_mode.log_message_from_tasks( - app, - unassigned_tasks, - "service is pending due to missing resources, scaling up cluster now...", - level=logging.INFO, - ) - new_pending_instances = await _launch_instances( - app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode - ) - cluster.pending_ec2s.extend( - [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances] - ) - # NOTE: to check the logs of UserData in EC2 instance - # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance - - return cluster - - async def _find_drainable_nodes( app: FastAPI, cluster: Cluster ) -> list[AssociatedInstance]: @@ -898,23 +875,25 @@ async def _deactivate_empty_nodes(app: FastAPI, cluster: Cluster) -> Cluster: if not active_empty_instances: return cluster - # drain this empty nodes - updated_nodes: list[Node] = await asyncio.gather( - *( - utils_docker.set_node_osparc_ready( - app_settings, - docker_client, - node.node, - ready=False, + with log_context( + _logger, logging.INFO, f"drain {len(active_empty_instances)} empty nodes" + ): + updated_nodes: list[Node] = await asyncio.gather( + *( + utils_docker.set_node_osparc_ready( + app_settings, + docker_client, + node.node, + ready=False, + ) + for node in active_empty_instances ) - for node in active_empty_instances - ) - ) - if updated_nodes: - _logger.info( - "following nodes were set to drain: '%s'", - f"{[node.description.hostname for node in updated_nodes if node.description]}", ) + if updated_nodes: + _logger.info( + "following nodes were set to drain: '%s'", + f"{[node.description.hostname for node in updated_nodes if node.description]}", + ) newly_drained_instances = [ AssociatedInstance(node=node, ec2_instance=instance.ec2_instance) for instance, node in zip(active_empty_instances, updated_nodes, strict=True) @@ -1021,10 +1000,15 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: if i.ec2_instance.id not in (new_terminating_instance_ids + terminated_instance_ids) ] + still_terminating_nodes = [ + i + for i in cluster.terminating_nodes + if i.ec2_instance.id not in terminated_instance_ids + ] return dataclasses.replace( cluster, drained_nodes=still_drained_nodes, - terminating_nodes=cluster.terminating_nodes + new_terminating_instances, + terminating_nodes=still_terminating_nodes + new_terminating_instances, terminated_instances=cluster.terminated_instances + [ NonAssociatedInstance(ec2_instance=i.ec2_instance) @@ -1119,7 +1103,7 @@ async def _drain_retired_nodes( ) -async def _scale_down_unused_cluster_machines( +async def _scale_down_unused_cluster_instances( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling, @@ -1129,57 +1113,89 @@ async def _scale_down_unused_cluster_machines( return await _try_scale_down_cluster(app, cluster) -async def _autoscale_cluster( +async def _scale_up_cluster( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling, allowed_instance_types: list[EC2InstanceType], + unassigned_tasks: list, ) -> Cluster: - # 1. check if we have pending tasks - unrunnable_tasks = await auto_scaling_mode.list_unrunnable_tasks(app) - _logger.info("found %s unrunnable tasks", len(unrunnable_tasks)) - # NOTE: this function predicts how the backend will assign tasks - queued_or_missing_instance_tasks, cluster = await _assign_tasks_to_current_cluster( - app, unrunnable_tasks, cluster, auto_scaling_mode - ) - # 2. try to activate drained nodes to cover some of the tasks - cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode) - - # 3. start buffer instances to cover the remaining tasks - cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode) - - # 4. scale down unused machines - cluster = await _scale_down_unused_cluster_machines(app, cluster, auto_scaling_mode) - - # 4. let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec + if not unassigned_tasks and ( + len(cluster.buffer_drained_nodes) + >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ): + return cluster + if ( - queued_or_missing_instance_tasks - or ( - len(cluster.buffer_drained_nodes) - < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ) - and ( - cluster.total_number_of_machines() - < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - ) + cluster.total_number_of_machines() + >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES ): _logger.info( - "%s unrunnable tasks could not be assigned, slowly trying to scale up...", - len(queued_or_missing_instance_tasks), + "cluster already hit the maximum allowed amount of instances (%s), not scaling up. " + "%s tasks will wait until instances are free.", + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, + len(unassigned_tasks), ) - cluster = await _scale_up_cluster( + return cluster + + # now we scale up + assert app_settings.AUTOSCALING_EC2_ACCESS # nosec + + # let's start these + if needed_ec2_instances := await _find_needed_instances( + app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode + ): + await auto_scaling_mode.log_message_from_tasks( app, - cluster, - queued_or_missing_instance_tasks, - auto_scaling_mode, - allowed_instance_types, + unassigned_tasks, + "service is pending due to missing resources, scaling up cluster now...", + level=logging.INFO, + ) + new_pending_instances = await _launch_instances( + app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode + ) + cluster.pending_ec2s.extend( + [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances] ) + # NOTE: to check the logs of UserData in EC2 instance + # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance return cluster +async def _autoscale_cluster( + app: FastAPI, + cluster: Cluster, + auto_scaling_mode: BaseAutoscaling, + allowed_instance_types: list[EC2InstanceType], +) -> Cluster: + # 1. check if we have pending tasks + unnasigned_pending_tasks = await auto_scaling_mode.list_unrunnable_tasks(app) + _logger.info("found %s pending tasks", len(unnasigned_pending_tasks)) + # NOTE: this function predicts how the backend will assign tasks + still_pending_tasks, cluster = await _assign_tasks_to_current_cluster( + app, unnasigned_pending_tasks, cluster, auto_scaling_mode + ) + + # 2. activate available drained nodes to cover some of the tasks + cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode) + + # 3. start buffer instances to cover the remaining tasks + cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode) + + # 4. scale down unused instances + cluster = await _scale_down_unused_cluster_instances( + app, cluster, auto_scaling_mode + ) + + # 5. scale up if necessary + return await _scale_up_cluster( + app, cluster, auto_scaling_mode, allowed_instance_types, still_pending_tasks + ) + + async def _notify_autoscaling_status( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling ) -> None: