From 47194b9050fa679203ba3427c4634eb02a91397b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:14:36 +0100 Subject: [PATCH 01/86] instance ready is depending on mode --- .../modules/auto_scaling_core.py | 22 +++---------------- .../modules/auto_scaling_mode_base.py | 13 +++++++++++ .../modules/auto_scaling_mode_dynamic.py | 9 +++++++- .../utils/utils_docker.py | 9 ++++++++ 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index d9ded9d9150..789c27cf630 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -15,11 +15,7 @@ Resources, ) from fastapi import FastAPI -from models_library.generated_models.docker_rest_api import ( - Availability, - Node, - NodeState, -) +from models_library.generated_models.docker_rest_api import Node, NodeState from servicelib.logging_utils import log_catch from types_aiobotocore_ec2.literals import InstanceTypeType @@ -78,29 +74,17 @@ async def _analyze_current_cluster( docker_nodes, existing_ec2_instances ) - def _is_node_up_and_available(node: Node, availability: Availability) -> bool: - assert node.Status # nosec - assert node.Spec # nosec - return bool( - node.Status.State == NodeState.ready - and node.Spec.Availability == availability - ) - def _node_not_ready(node: Node) -> bool: assert node.Status # nosec return bool(node.Status.State != NodeState.ready) all_drained_nodes = [ - i - for i in attached_ec2s - if _is_node_up_and_available(i.node, Availability.drain) + i for i in attached_ec2s if auto_scaling_mode.is_instance_drained(i) ] cluster = Cluster( active_nodes=[ - i - for i in attached_ec2s - if _is_node_up_and_available(i.node, Availability.active) + i for 
i in attached_ec2s if auto_scaling_mode.is_instance_active(app, i) ], drained_nodes=all_drained_nodes[ app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER : diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index d375a511a5e..3a2ef776f9c 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -4,6 +4,7 @@ from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI from models_library.docker import DockerLabelKey +from models_library.generated_models.docker_rest_api import Availability from models_library.generated_models.docker_rest_api import Node as DockerNode from servicelib.logging_utils import LogLevelInt from types_aiobotocore_ec2.literals import InstanceTypeType @@ -13,6 +14,7 @@ AssignedTasksToInstanceType, AssociatedInstance, ) +from ..utils import utils_docker @dataclass @@ -109,3 +111,14 @@ async def compute_cluster_total_resources( app: FastAPI, instances: list[AssociatedInstance] ) -> Resources: ... + + @staticmethod + @abstractmethod + def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + ... 
+ + @staticmethod + def is_instance_drained(instance: AssociatedInstance) -> bool: + return utils_docker.is_node_ready_and_available( + instance.node, Availability.drain + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index 3e1737b814c..8c4d1265263 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -3,7 +3,7 @@ from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI from models_library.docker import DockerLabelKey -from models_library.generated_models.docker_rest_api import Node, Task +from models_library.generated_models.docker_rest_api import Availability, Node, Task from servicelib.logging_utils import LogLevelInt from types_aiobotocore_ec2.literals import InstanceTypeType @@ -133,3 +133,10 @@ async def compute_cluster_total_resources( return await utils_docker.compute_cluster_total_resources( [i.node for i in instances] ) + + @staticmethod + def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + assert app # nosec + return utils_docker.is_node_ready_and_available( + instance.node, Availability.active + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index 42c8e3376f1..3f0743cfdd8 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -20,6 +20,7 @@ DockerLabelKey, ) from models_library.generated_models.docker_rest_api import ( + Availability, Node, NodeState, Service, @@ -537,3 +538,11 @@ def get__new_node_docker_tags( } | 
{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY: ec2_instance.type} ) + + +def is_node_ready_and_available(node: Node, availability: Availability) -> bool: + assert node.Status # nosec + assert node.Spec # nosec + return bool( + node.Status.State == NodeState.ready and node.Spec.Availability == availability + ) From ea6f0132c0389a4e4cedbedbc8046fe2d09b5a7f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:18:53 +0100 Subject: [PATCH 02/86] hmm not yet totally there --- .../modules/auto_scaling_mode_computational.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 8ad53e676e9..0ca0ea83dea 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -8,7 +8,7 @@ DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY, DockerLabelKey, ) -from models_library.generated_models.docker_rest_api import Node +from models_library.generated_models.docker_rest_api import Availability, Node from pydantic import AnyUrl, ByteSize from servicelib.logging_utils import LogLevelInt from servicelib.utils import logged_gather @@ -149,3 +149,12 @@ async def compute_cluster_total_resources( return await dask.compute_cluster_total_resources( _scheduler_url(app), instances ) + + @staticmethod + def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + if not utils_docker.is_node_ready_and_available( + instance.node, Availability.active + ): + return False + + # now check if dask can be connected From 96114d5fa632eeed3b31b2b0e7d2def54473c86b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> 
Date: Tue, 9 Jan 2024 11:29:57 +0100 Subject: [PATCH 03/86] missing manual testing parts --- services/autoscaling/tests/manual/.env-devel | 1 + services/autoscaling/tests/manual/README.md | 1 + 2 files changed, 2 insertions(+) diff --git a/services/autoscaling/tests/manual/.env-devel b/services/autoscaling/tests/manual/.env-devel index a56c8ab125b..f0c4a27eac4 100644 --- a/services/autoscaling/tests/manual/.env-devel +++ b/services/autoscaling/tests/manual/.env-devel @@ -13,6 +13,7 @@ EC2_INSTANCES_KEY_NAME=XXXXXXXXXX EC2_INSTANCES_NAME_PREFIX=testing-osparc-computational-cluster EC2_INSTANCES_SECURITY_GROUP_IDS="[\"XXXXXXXXXX\"]" EC2_INSTANCES_SUBNET_ID=XXXXXXXXXX +EC2_INSTANCES_CUSTOM_TAGS='{"special": "testing"}' LOG_FORMAT_LOCAL_DEV_ENABLED=True # define the following to activate dynamic autoscaling # NODES_MONITORING_NEW_NODES_LABELS="[\"testing.autoscaled-node\"]" diff --git a/services/autoscaling/tests/manual/README.md b/services/autoscaling/tests/manual/README.md index baa8c88aa94..b7a2f175009 100644 --- a/services/autoscaling/tests/manual/README.md +++ b/services/autoscaling/tests/manual/README.md @@ -9,6 +9,7 @@ The dynamic mode is used directly with docker swarm facilities. 1. AWS EC2 access 2. a machine running in EC2 with docker installed and access to osparc-simcore repository (for example t2.xlarge to have some computational power) +3. Note that VScode remote can be used to directly code on the EC2 instance. 
## computational mode From e231f2088bc16febd1531213b3e9dd3c75ad4d85 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 13:21:41 +0000 Subject: [PATCH 04/86] added remote debugging facilities --- .../core/application.py | 4 +++ .../core/settings.py | 2 ++ .../modules/remote_debug.py | 32 +++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/application.py b/services/autoscaling/src/simcore_service_autoscaling/core/application.py index 7f07c67c110..18e4bb3d565 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/application.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/application.py @@ -4,6 +4,7 @@ from servicelib.fastapi.prometheus_instrumentation import ( setup_prometheus_instrumentation, ) +from models_library.basic_types import BootModeEnum from .._meta import ( API_VERSION, @@ -16,6 +17,7 @@ APP_STARTED_DYNAMIC_BANNER_MSG, ) from ..api.routes import setup_api_routes +from ..modules import remote_debug from ..modules.auto_scaling_task import setup as setup_background_task from ..modules.docker import setup as setup_docker from ..modules.ec2 import setup as setup_ec2 @@ -46,6 +48,8 @@ def create_app(settings: ApplicationSettings) -> FastAPI: setup_prometheus_instrumentation(app) # PLUGINS SETUP + if settings.SC_BOOT_MODE == BootModeEnum.DEBUG: + remote_debug.setup(app) setup_api_routes(app) setup_docker(app) setup_rabbitmq(app) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py index 989e9cff3a6..352c3731e42 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py @@ -8,6 +8,7 @@ BootModeEnum, 
BuildTargetEnum, LogLevel, + PortInt, VersionTag, ) from models_library.docker import DockerLabelKey @@ -182,6 +183,7 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): AUTOSCALING_DEBUG: bool = Field( default=False, description="Debug mode", env=["AUTOSCALING_DEBUG", "DEBUG"] ) + AUTOSCALING_REMOTE_DEBUG_PORT: PortInt = PortInt(3000) AUTOSCALING_LOGLEVEL: LogLevel = Field( LogLevel.INFO, env=["AUTOSCALING_LOGLEVEL", "LOG_LEVEL", "LOGLEVEL"] diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py b/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py new file mode 100644 index 00000000000..f9857ddd75b --- /dev/null +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py @@ -0,0 +1,32 @@ +""" Setup remote debugger with Python Tools for Visual Studio (PTVSD) + +""" +import logging + +from fastapi import FastAPI +from simcore_service_autoscaling.core.settings import get_application_settings + +logger = logging.getLogger(__name__) + + +def setup(app: FastAPI) -> None: + remote_debug_port = get_application_settings(app).AUTOSCALING_REMOTE_DEBUG_PORT + + def on_startup() -> None: + try: + logger.debug("Enabling attach ptvsd ...") + # + # SEE https://github.com/microsoft/ptvsd#enabling-debugging + # + import ptvsd + + ptvsd.enable_attach( + address=("0.0.0.0", remote_debug_port), # nosec # noqa: S104 + ) # nosec + except ImportError as err: + msg = "Cannot enable remote debugging. 
Please install ptvsd first" + raise RuntimeError(msg) from err + + logger.info("Remote debugging enabled: listening port %s", remote_debug_port) + + app.add_event_handler("startup", on_startup) From 8758ee985e3ef44ec7cab522d0e0b452263cff0f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:18:57 +0000 Subject: [PATCH 05/86] added is_worker_connected and refcator --- .../modules/dask.py | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 7982672f91d..4a9fe6a0974 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -49,47 +49,6 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: raise DaskSchedulerNotFoundError(url=url) from exc -async def list_unrunnable_tasks(url: AnyUrl) -> list[DaskTask]: - """ - Raises: - DaskSchedulerNotFoundError - """ - - def _list_tasks( - dask_scheduler: distributed.Scheduler, - ) -> dict[str, dict[str, Any]]: - return { - task.key: task.resource_restrictions for task in dask_scheduler.unrunnable - } - - async with _scheduler_client(url) as client: - list_of_tasks: dict[ - DaskTaskId, DaskTaskResources - ] = await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) - _logger.info("found unrunnable tasks: %s", list_of_tasks) - return [ - DaskTask(task_id=task_id, required_resources=task_resources) - for task_id, task_resources in list_of_tasks.items() - ] - - -async def list_processing_tasks(url: AnyUrl) -> list[DaskTaskId]: - """ - Raises: - DaskSchedulerNotFoundError - """ - async with _scheduler_client(url) as client: - processing_tasks = set() - if worker_to_processing_tasks := await _wrap_client_async_routine( - client.processing() - ): - 
_logger.info("cluster worker processing: %s", worker_to_processing_tasks) - for tasks in worker_to_processing_tasks.values(): - processing_tasks |= set(tasks) - - return list(processing_tasks) - - DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: TypeAlias = dict[str, Any] @@ -131,6 +90,57 @@ def _find_by_worker_host( return next(iter(filtered_workers.items())) +async def is_worker_connected( + scheduler_url: AnyUrl, worker_ec2_instance: EC2InstanceData +) -> bool: + with contextlib.suppress(DaskNoWorkersError, DaskWorkerNotFoundError): + async with _scheduler_client(scheduler_url) as client: + _dask_worker_from_ec2_instance(client, worker_ec2_instance) + return True + return False + + +async def list_unrunnable_tasks(url: AnyUrl) -> list[DaskTask]: + """ + Raises: + DaskSchedulerNotFoundError + """ + + def _list_tasks( + dask_scheduler: distributed.Scheduler, + ) -> dict[str, dict[str, Any]]: + return { + task.key: task.resource_restrictions for task in dask_scheduler.unrunnable + } + + async with _scheduler_client(url) as client: + list_of_tasks: dict[ + DaskTaskId, DaskTaskResources + ] = await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) + _logger.debug("found unrunnable tasks: %s", list_of_tasks) + return [ + DaskTask(task_id=task_id, required_resources=task_resources) + for task_id, task_resources in list_of_tasks.items() + ] + + +async def list_processing_tasks(url: AnyUrl) -> list[DaskTaskId]: + """ + Raises: + DaskSchedulerNotFoundError + """ + async with _scheduler_client(url) as client: + processing_tasks = set() + if worker_to_processing_tasks := await _wrap_client_async_routine( + client.processing() + ): + _logger.info("cluster worker processing: %s", worker_to_processing_tasks) + for tasks in worker_to_processing_tasks.values(): + processing_tasks |= set(tasks) + + return list(processing_tasks) + + async def get_worker_still_has_results_in_memory( url: AnyUrl, ec2_instance: EC2InstanceData ) -> int: From 
0e48b8576ce4fe9945cd526ba7d76b98c83afa30 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:19:12 +0000 Subject: [PATCH 06/86] added pending_nodes --- .../src/simcore_service_autoscaling/models.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index de2df37d83c..b441bdfb344 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -25,16 +25,21 @@ class AssociatedInstance: ec2_instance: EC2InstanceData -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True, slots=True) class Cluster: active_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is active (with running tasks)" + "description": "This is a EC2 backed docker node which is active and ready to receive tasks (or with running tasks)" + } + ) + pending_nodes: list[AssociatedInstance] = field( + metadata={ + "description": "This is a EC2 backed docker node which is active and NOT yet ready to receive tasks" } ) drained_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is drained (with no tasks)" + "description": "This is a EC2 backed docker node which is drained (cannot accept tasks)" } ) reserve_drained_nodes: list[AssociatedInstance] = field( From 1b152215c4b0e42c7e61a60974c2536c5e1d3236 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:19:51 +0000 Subject: [PATCH 07/86] use pending nodes and refactor also improve logging --- .../modules/auto_scaling_core.py | 50 ++++++++++++------- .../modules/auto_scaling_mode_base.py | 2 +- .../auto_scaling_mode_computational.py | 41 +++++++++------ 
.../modules/auto_scaling_mode_dynamic.py | 2 +- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 789c27cf630..c2c7800de63 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -3,6 +3,7 @@ import dataclasses import datetime import itertools +import json import logging from typing import cast @@ -15,6 +16,7 @@ Resources, ) from fastapi import FastAPI +from fastapi.encoders import jsonable_encoder from models_library.generated_models.docker_rest_api import Node, NodeState from servicelib.logging_utils import log_catch from types_aiobotocore_ec2.literals import InstanceTypeType @@ -82,10 +84,16 @@ def _node_not_ready(node: Node) -> bool: i for i in attached_ec2s if auto_scaling_mode.is_instance_drained(i) ] + active_nodes, attached_not_yet_ready_nodes = [], [] + for instance in attached_ec2s: + if await auto_scaling_mode.is_instance_active(app, instance): + active_nodes.append(instance) + else: + attached_not_yet_ready_nodes.append(instance) + cluster = Cluster( - active_nodes=[ - i for i in attached_ec2s if auto_scaling_mode.is_instance_active(app, i) - ], + active_nodes=active_nodes, + pending_nodes=attached_not_yet_ready_nodes, drained_nodes=all_drained_nodes[ app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER : ], @@ -96,7 +104,9 @@ def _node_not_ready(node: Node) -> bool: terminated_instances=terminated_ec2_instances, disconnected_nodes=[n for n in docker_nodes if _node_not_ready(n)], ) - _logger.info("current state: %s", f"{cluster=}") + _logger.debug( + "current state: %s", f"{json.dumps(jsonable_encoder( cluster), indent=2)}" + ) return cluster @@ -316,7 +326,7 @@ async def _find_needed_instances( auto_scaling_mode: 
BaseAutoscaling, ) -> dict[EC2InstanceType, int]: # 1. check first the pending task needs - active_instances_to_tasks: list[AssignedTasksToInstance] = [ + active_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( instance=i.ec2_instance, assigned_tasks=[], @@ -325,13 +335,13 @@ async def _find_needed_instances( ) for i in cluster.active_nodes ] - pending_instances_to_tasks: list[AssignedTasksToInstance] = [ + pending_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( instance=i, assigned_tasks=[], available_resources=i.resources ) - for i in cluster.pending_ec2s + for i in cluster.pending_ec2s + [i.ec2_instance for i in cluster.pending_nodes] ] - drained_instances_to_tasks: list[AssignedTasksToInstance] = [ + drained_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( instance=i.ec2_instance, assigned_tasks=[], @@ -356,9 +366,9 @@ async def _find_needed_instances( task, auto_scaling_mode, task_defined_ec2_type, - active_instances_to_tasks, - pending_instances_to_tasks, - drained_instances_to_tasks, + active_ec2s_to_tasks, + pending_ec2s_to_tasks, + drained_ec2s_to_tasks, needed_new_instance_types_for_tasks, ): continue @@ -414,7 +424,7 @@ async def _find_needed_instances( ) > 0: # check if some are already pending remaining_pending_instances = [ - i.instance for i in pending_instances_to_tasks if not i.assigned_tasks + i.instance for i in pending_ec2s_to_tasks if not i.assigned_tasks ] if len(remaining_pending_instances) < ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER @@ -639,7 +649,12 @@ async def _deactivate_empty_nodes( _logger.exception( "EC2 node instance is not registered to dask-scheduler! 
TIP: Needs investigation" ) - + if not active_empty_instances: + return cluster + _logger.info( + "following nodes will be drained: '%s'", + f"{[instance.node.Description.Hostname for instance in active_empty_instances if instance.node.Description]}", + ) # drain this empty nodes updated_nodes: list[Node] = await asyncio.gather( *( @@ -696,7 +711,7 @@ async def _find_terminateable_instances( _logger.info( "%s has still %ss before being terminateable", f"{instance.ec2_instance.id=}", - f"{(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION - elapsed_time_since_drained).total_seconds()}", + f"{(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION - elapsed_time_since_drained).total_seconds():.0f}", ) if terminateable_nodes: @@ -753,9 +768,6 @@ async def _autoscale_cluster( still_unrunnable_tasks, cluster = await _activate_drained_nodes( app, cluster, unrunnable_tasks, auto_scaling_mode ) - _logger.info( - "still %s unrunnable tasks after node activation", len(still_unrunnable_tasks) - ) # let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) @@ -764,6 +776,10 @@ async def _autoscale_cluster( len(cluster.reserve_drained_nodes) < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): + _logger.info( + "still %s unrunnable tasks after node activation", + len(still_unrunnable_tasks), + ) # yes? 
then scale up cluster = await _scale_up_cluster( app, cluster, still_unrunnable_tasks, auto_scaling_mode diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index 3a2ef776f9c..52059e4178e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -114,7 +114,7 @@ async def compute_cluster_total_resources( @staticmethod @abstractmethod - def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: ... @staticmethod diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 0ca0ea83dea..40dd57e59c4 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -12,6 +12,10 @@ from pydantic import AnyUrl, ByteSize from servicelib.logging_utils import LogLevelInt from servicelib.utils import logged_gather +from simcore_service_autoscaling.core.errors import ( + DaskNoWorkersError, + DaskWorkerNotFoundError, +) from types_aiobotocore_ec2.literals import InstanceTypeType from ..core.settings import get_application_settings @@ -117,15 +121,18 @@ async def get_task_defined_instance(app: FastAPI, task) -> InstanceTypeType | No async def compute_node_used_resources( app: FastAPI, instance: AssociatedInstance ) -> Resources: - num_results_in_memory = await dask.get_worker_still_has_results_in_memory( - _scheduler_url(app), instance.ec2_instance - ) - if num_results_in_memory > 0: - # NOTE: this is a trick to 
consider the node still useful - return Resources(cpus=1, ram=ByteSize()) - return await dask.get_worker_used_resources( - _scheduler_url(app), instance.ec2_instance - ) + try: + num_results_in_memory = await dask.get_worker_still_has_results_in_memory( + _scheduler_url(app), instance.ec2_instance + ) + if num_results_in_memory > 0: + # NOTE: this is a trick to consider the node still useful + return Resources(cpus=1, ram=ByteSize()) + return await dask.get_worker_used_resources( + _scheduler_url(app), instance.ec2_instance + ) + except (DaskWorkerNotFoundError, DaskNoWorkersError): + return Resources.create_as_empty() @staticmethod async def compute_cluster_used_resources( @@ -146,15 +153,21 @@ async def compute_cluster_used_resources( async def compute_cluster_total_resources( app: FastAPI, instances: list[AssociatedInstance] ) -> Resources: - return await dask.compute_cluster_total_resources( - _scheduler_url(app), instances - ) + try: + return await dask.compute_cluster_total_resources( + _scheduler_url(app), instances + ) + except DaskNoWorkersError: + return Resources.create_as_empty() @staticmethod - def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: if not utils_docker.is_node_ready_and_available( instance.node, Availability.active ): return False - # now check if dask can be connected + # now check if dask-scheduler is available + return await dask.is_worker_connected( + _scheduler_url(app), instance.ec2_instance + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index 8c4d1265263..e76b0e32420 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -135,7 
+135,7 @@ async def compute_cluster_total_resources( ) @staticmethod - def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: + async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool: assert app # nosec return utils_docker.is_node_ready_and_available( instance.node, Availability.active From 7c6289091f44f2de7b2aec8393226f2ab3688067 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:33:56 +0000 Subject: [PATCH 08/86] ensure negative values for cpus are capped to 0 --- packages/aws-library/src/aws_library/ec2/models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/aws-library/src/aws_library/ec2/models.py b/packages/aws-library/src/aws_library/ec2/models.py index 9b67359c915..0af3e7a9c47 100644 --- a/packages/aws-library/src/aws_library/ec2/models.py +++ b/packages/aws-library/src/aws_library/ec2/models.py @@ -53,6 +53,13 @@ def __sub__(self, other: "Resources") -> "Resources": } ) + @validator("cpus", pre=True) + @classmethod + def ensure_negative_is_0(cls, v: float) -> float: + if v < 0: + return 0 + return v + @dataclass(frozen=True) class EC2InstanceType: From 084e7539d14d11cf0e23fdd12c2501a6776a0437 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:57:04 +0000 Subject: [PATCH 09/86] misconfigured in redis --- .../src/simcore_service_clusters_keeper/utils/clusters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 341d7f9d749..e07c6e571bd 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -67,6 +67,8 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: startup_commands = 
ec2_boot_specific.custom_boot_scripts.copy() startup_commands.extend( [ + # NOTE: https://stackoverflow.com/questions/41203492/solving-redis-warnings-on-overcommit-memory-and-transparent-huge-pages-for-ubunt + "sysctl vm.overcommit_memory=1", f"echo '{_docker_compose_yml_base64_encoded()}' | base64 -d > docker-compose.yml", "docker swarm init", f"{' '.join(environment_variables)} docker stack deploy --with-registry-auth --compose-file=docker-compose.yml dask_stack", From c53119b577611e5ef8a335f2b8f3f74e65a34d41 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:57:23 +0000 Subject: [PATCH 10/86] renaming + logs --- .../modules/auto_scaling_core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index c2c7800de63..ba95c445612 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -84,16 +84,16 @@ def _node_not_ready(node: Node) -> bool: i for i in attached_ec2s if auto_scaling_mode.is_instance_drained(i) ] - active_nodes, attached_not_yet_ready_nodes = [], [] + active_nodes, pending_nodes = [], [] for instance in attached_ec2s: if await auto_scaling_mode.is_instance_active(app, instance): active_nodes.append(instance) else: - attached_not_yet_ready_nodes.append(instance) + pending_nodes.append(instance) cluster = Cluster( active_nodes=active_nodes, - pending_nodes=attached_not_yet_ready_nodes, + pending_nodes=pending_nodes, drained_nodes=all_drained_nodes[ app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER : ], @@ -140,6 +140,7 @@ async def _try_attach_pending_ec2s( available=False, ) new_found_instances.append(AssociatedInstance(new_node, instance_data)) + 
_logger.info("Attached new EC2 instance %s", instance_data.id) else: still_pending_ec2s.append(instance_data) except Ec2InvalidDnsNameError: # noqa: PERF203 From b08bedab69f1938e1b3f51fccf9ea0a705a223f0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:58:14 +0000 Subject: [PATCH 11/86] ensure not available scheduler does not break autoscaling --- .../modules/auto_scaling_mode_computational.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 40dd57e59c4..b60732ab147 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -12,12 +12,13 @@ from pydantic import AnyUrl, ByteSize from servicelib.logging_utils import LogLevelInt from servicelib.utils import logged_gather -from simcore_service_autoscaling.core.errors import ( +from types_aiobotocore_ec2.literals import InstanceTypeType + +from ..core.errors import ( DaskNoWorkersError, + DaskSchedulerNotFoundError, DaskWorkerNotFoundError, ) -from types_aiobotocore_ec2.literals import InstanceTypeType - from ..core.settings import get_application_settings from ..models import ( AssignedTasksToInstance, @@ -61,7 +62,13 @@ def get_new_node_docker_tags( @staticmethod async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: - return await dask.list_unrunnable_tasks(_scheduler_url(app)) + try: + return await dask.list_unrunnable_tasks(_scheduler_url(app)) + except DaskSchedulerNotFoundError: + _logger.warning( + "No dask scheduler found. TIP: Normal during machine startup." 
+ ) + return [] @staticmethod def try_assigning_task_to_node( From fb21a6ba40a16c7ae85f08fd0fa7fec63c331db8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:15:32 +0000 Subject: [PATCH 12/86] improving feedback --- .../modules/auto_scaling_core.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index ba95c445612..008ef15f66f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -614,8 +614,7 @@ async def _scale_up_cluster( await auto_scaling_mode.log_message_from_tasks( app, pending_tasks, - "service is pending due to missing resources, scaling up cluster now\n" - f"{sum(n for n in needed_ec2_instances.values())} new machines will be added, please wait...", + "service is pending due to missing resources, scaling up cluster now...", level=logging.INFO, ) # NOTE: notify the up-scaling progress started... 
@@ -623,6 +622,12 @@ async def _scale_up_cluster( new_pending_instances = await _start_instances( app, needed_ec2_instances, pending_tasks, auto_scaling_mode ) + await auto_scaling_mode.log_message_from_tasks( + app, + pending_tasks, + f"{len(new_pending_instances)} new machines being started, please wait...", + level=logging.INFO, + ) cluster.pending_ec2s.extend(new_pending_instances) # NOTE: to check the logs of UserData in EC2 instance # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance From 885694ac560c8fa0156e293f92527bff19c93326 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:00:11 +0000 Subject: [PATCH 13/86] do not expose redis --- .../tests/manual/docker-compose.yml | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/services/autoscaling/tests/manual/docker-compose.yml b/services/autoscaling/tests/manual/docker-compose.yml index 7dbcf5bd4cc..635722243ea 100644 --- a/services/autoscaling/tests/manual/docker-compose.yml +++ b/services/autoscaling/tests/manual/docker-compose.yml @@ -5,7 +5,7 @@ services: init: true hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" ports: - - "5672:5672" + # - "5672:5672" - "15672:15672" - "15692" environment: @@ -23,8 +23,8 @@ services: image: "redis:6.2.6@sha256:4bed291aa5efb9f0d77b76ff7d4ab71eee410962965d052552db1fb80576431d" init: true hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" - ports: - - "6379:6379" + # ports: + # - "6379:6379" healthcheck: test: [ "CMD", "redis-cli", "ping" ] interval: 5s @@ -33,24 +33,6 @@ services: volumes: - redis-data:/data - redis-commander: - image: rediscommander/redis-commander:latest - init: true - hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" - ports: - - "18081:8081" - environment: - - >- - REDIS_HOSTS= - resources:${REDIS_HOST}:${REDIS_PORT}:0, - locks:${REDIS_HOST}:${REDIS_PORT}:1, - 
validation_codes:${REDIS_HOST}:${REDIS_PORT}:2, - scheduled_maintenance:${REDIS_HOST}:${REDIS_PORT}:3, - user_notifications:${REDIS_HOST}:${REDIS_PORT}:4, - announcements:${REDIS_HOST}:${REDIS_PORT}:5, - distributed_identifiers:${REDIS_HOST}:${REDIS_PORT}:6 - # If you add/remove a db, do not forget to update the --databases entry in the docker-compose.yml - autoscaling: image: local/autoscaling:development dns: 8.8.8.8 # needed to access internet From a6de940c88b4d6b6d711b5b606e271d70818049c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:03:08 +0000 Subject: [PATCH 14/86] minor --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 008ef15f66f..4be848a2b1b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -105,7 +105,8 @@ def _node_not_ready(node: Node) -> bool: disconnected_nodes=[n for n in docker_nodes if _node_not_ready(n)], ) _logger.debug( - "current state: %s", f"{json.dumps(jsonable_encoder( cluster), indent=2)}" + "current state: %s", + f"{json.dumps(jsonable_encoder(cluster), indent=2)}", ) return cluster From e876707970c8ff1ae1aa631276f3446d8112036b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 18:01:55 +0000 Subject: [PATCH 15/86] added function to get all processing tasks per worker --- .../modules/dask.py | 36 +++++++++++++------ .../tests/unit/test_modules_dask.py | 8 ++--- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 4a9fe6a0974..615d19135ed 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -1,5 +1,6 @@ import contextlib import logging +from collections import defaultdict from collections.abc import AsyncIterator, Coroutine from typing import Any, Final, TypeAlias @@ -124,21 +125,36 @@ def _list_tasks( ] -async def list_processing_tasks(url: AnyUrl) -> list[DaskTaskId]: +async def list_processing_tasks_per_worker( + url: AnyUrl, +) -> dict[DaskWorkerUrl, list[DaskTask]]: """ Raises: DaskSchedulerNotFoundError """ + + def _list_tasks( + dask_scheduler: distributed.Scheduler, + ) -> dict[str, list[tuple[DaskTaskId, DaskTaskResources]]]: + worker_to_processing_tasks = defaultdict(list) + for task_key, task_state in dask_scheduler.tasks.items(): + if task_state.processing_on: + worker_to_processing_tasks[task_state.processing_on.address].append( + (task_key, task_state.resource_restrictions) + ) + return worker_to_processing_tasks + async with _scheduler_client(url) as client: - processing_tasks = set() - if worker_to_processing_tasks := await _wrap_client_async_routine( - client.processing() - ): - _logger.info("cluster worker processing: %s", worker_to_processing_tasks) - for tasks in worker_to_processing_tasks.values(): - processing_tasks |= set(tasks) - - return list(processing_tasks) + worker_to_tasks: dict[ + str, list[tuple[DaskTaskId, DaskTaskResources]] + ] = await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) + _logger.debug("found processing tasks: %s", worker_to_tasks) + tasks_per_worker = {} + for worker, tasks in worker_to_tasks.items(): + tasks_per_worker[worker] = [ + DaskTask(task_id=t[0], required_resources=t[1]) for t in tasks + ] + return tasks_per_worker async def get_worker_still_has_results_in_memory( diff --git 
a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 09b23f84b33..24a679773ba 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -28,7 +28,7 @@ _scheduler_client, get_worker_still_has_results_in_memory, get_worker_used_resources, - list_processing_tasks, + list_processing_tasks_per_worker, list_unrunnable_tasks, ) from tenacity import retry, stop_after_delay, wait_fixed @@ -107,13 +107,13 @@ def _add_fct(x: int, y: int) -> int: return x + y # there is nothing now - assert await list_processing_tasks(url=scheduler_url) == [] + assert await list_processing_tasks_per_worker(url=scheduler_url) == [] # this function will be queued and executed as there are no specific resources needed future_queued_task = dask_spec_cluster_client.submit(_add_fct, 2, 5) assert future_queued_task - assert await list_processing_tasks(scheduler_url) == [ + assert await list_processing_tasks_per_worker(scheduler_url) == [ DaskTaskId(future_queued_task.key) ] @@ -121,7 +121,7 @@ def _add_fct(x: int, y: int) -> int: assert result == 7 # nothing processing anymore - assert await list_processing_tasks(url=scheduler_url) == [] + assert await list_processing_tasks_per_worker(url=scheduler_url) == [] _DASK_SCHEDULER_REACTION_TIME_S: Final[int] = 4 From 47d7036ee5bc432f9fdffcaacdf0de57acd159e0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 18:05:46 +0000 Subject: [PATCH 16/86] allow to max out the workers --- .../modules/auto_scaling_mode_computational.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index b60732ab147..b27fb8e9340 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -63,7 +63,16 @@ def get_new_node_docker_tags( @staticmethod async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: try: - return await dask.list_unrunnable_tasks(_scheduler_url(app)) + unrunnable_tasks = await dask.list_unrunnable_tasks(_scheduler_url(app)) + # NOTE: any worker "processing" more than 1 task means that the other tasks are queued! + processing_tasks_by_worker = await dask.list_processing_tasks_per_worker( + _scheduler_url(app) + ) + queued_tasks = [] + for tasks in processing_tasks_by_worker.values(): + queued_tasks += tasks[1:] + _logger.info("found %s potentially queued tasks", len(queued_tasks)) + return unrunnable_tasks + queued_tasks except DaskSchedulerNotFoundError: _logger.warning( "No dask scheduler found. TIP: Normal during machine startup." @@ -83,7 +92,7 @@ async def try_assigning_task_to_instances( pending_task, instances_to_tasks: list[AssignedTasksToInstance], *, - notify_progress: bool + notify_progress: bool, ) -> bool: return await utils.try_assigning_task_to_instances( app, From ad6c735fd82fb0d18880578df60a2c1701d8ff33 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 20:22:30 +0000 Subject: [PATCH 17/86] added worker retierement --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 3 ++- .../modules/auto_scaling_mode_base.py | 5 +++++ .../modules/auto_scaling_mode_computational.py | 4 ++++ .../modules/auto_scaling_mode_dynamic.py | 5 +++++ .../src/simcore_service_autoscaling/modules/dask.py | 5 +++++ 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 
4be848a2b1b..4dc1754fa0f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -570,7 +570,7 @@ async def _start_instances( "Exceptionally high load on computational cluster, please try again later.", level=logging.ERROR, ) - elif isinstance(r, Exception): + elif isinstance(r, BaseException): _logger.error("Unexpected error happened when starting EC2 instance: %s", r) last_issue = f"{r}" elif isinstance(r, list): @@ -793,6 +793,7 @@ async def _autoscale_cluster( ) elif still_unrunnable_tasks == unrunnable_tasks: # NOTE: we only scale down in case we did not just scale up. The swarm needs some time to adjust + await auto_scaling_mode.try_retire_nodes(app) cluster = await _deactivate_empty_nodes(app, cluster, auto_scaling_mode) cluster = await _try_scale_down_cluster(app, cluster) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index 52059e4178e..6f512b4e8c5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -122,3 +122,8 @@ def is_instance_drained(instance: AssociatedInstance) -> bool: return utils_docker.is_node_ready_and_available( instance.node, Availability.drain ) + + @staticmethod + @abstractmethod + async def try_retire_nodes(app: FastAPI) -> None: + ... 
diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index b27fb8e9340..54cff65e18d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -187,3 +187,7 @@ async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool return await dask.is_worker_connected( _scheduler_url(app), instance.ec2_instance ) + + @staticmethod + async def try_retire_nodes(app: FastAPI) -> None: + await dask.try_retire_nodes(_scheduler_url(app)) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index e76b0e32420..3dcf03e36fb 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -140,3 +140,8 @@ async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool return utils_docker.is_node_ready_and_available( instance.node, Availability.active ) + + @staticmethod + async def try_retire_nodes(app: FastAPI) -> None: + assert app # nosec + # nothing to do here diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 615d19135ed..d7d5746b746 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -229,3 +229,8 @@ async def compute_cluster_total_resources( continue return Resources.create_as_empty() + + +async def try_retire_nodes(url: AnyUrl) -> None: + async with 
_scheduler_client(url) as client: + await _wrap_client_async_routine(client.retire_workers()) From c7bcabbf3b70f541cbeec28afc1ecdfd43ff80cc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 20:53:19 +0000 Subject: [PATCH 18/86] use debugpy --- services/autoscaling/docker/entrypoint.sh | 4 +-- .../modules/remote_debug.py | 32 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/services/autoscaling/docker/entrypoint.sh b/services/autoscaling/docker/entrypoint.sh index 85ad74c8f13..7ae4d22910a 100755 --- a/services/autoscaling/docker/entrypoint.sh +++ b/services/autoscaling/docker/entrypoint.sh @@ -71,8 +71,8 @@ if [ "${SC_BUILD_TARGET}" = "development" ]; then fi if [ "${SC_BOOT_MODE}" = "debug-ptvsd" ]; then - # NOTE: production does NOT pre-installs ptvsd - pip install --no-cache-dir ptvsd + # NOTE: production does NOT pre-installs debugpy + pip install --no-cache-dir debugpy fi # Appends docker group if socket is mounted diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py b/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py index f9857ddd75b..318f7a11a02 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/remote_debug.py @@ -1,32 +1,26 @@ -""" Setup remote debugger with Python Tools for Visual Studio (PTVSD) +""" Setup remote debugger with debugpy - a debugger for Python + https://github.com/microsoft/debugpy """ import logging from fastapi import FastAPI -from simcore_service_autoscaling.core.settings import get_application_settings -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) +_REMOTE_DEBUGGING_PORT = 3000 -def setup(app: FastAPI) -> None: - remote_debug_port = get_application_settings(app).AUTOSCALING_REMOTE_DEBUG_PORT - +def setup_remote_debugging(app: FastAPI) -> None: def 
on_startup() -> None: try: - logger.debug("Enabling attach ptvsd ...") - # - # SEE https://github.com/microsoft/ptvsd#enabling-debugging - # - import ptvsd - - ptvsd.enable_attach( - address=("0.0.0.0", remote_debug_port), # nosec # noqa: S104 - ) # nosec - except ImportError as err: - msg = "Cannot enable remote debugging. Please install ptvsd first" - raise RuntimeError(msg) from err + _logger.info("Attaching debugpy on %s...", _REMOTE_DEBUGGING_PORT) + + import debugpy - logger.info("Remote debugging enabled: listening port %s", remote_debug_port) + debugpy.listen(("0.0.0.0", _REMOTE_DEBUGGING_PORT)) # nosec # noqa: S104 + + except ImportError as err: # pragma: no cover + msg = "Cannot enable remote debugging. Please install debugpy first" + raise RuntimeError(msg) from err app.add_event_handler("startup", on_startup) From d1e4803ea0648f6f82eb04c939ff9618af0414e5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 20:53:41 +0000 Subject: [PATCH 19/86] use debugpy --- .../src/simcore_service_autoscaling/core/application.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/application.py b/services/autoscaling/src/simcore_service_autoscaling/core/application.py index 18e4bb3d565..5dab6100233 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/application.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/application.py @@ -17,12 +17,12 @@ APP_STARTED_DYNAMIC_BANNER_MSG, ) from ..api.routes import setup_api_routes -from ..modules import remote_debug from ..modules.auto_scaling_task import setup as setup_background_task from ..modules.docker import setup as setup_docker from ..modules.ec2 import setup as setup_ec2 from ..modules.rabbitmq import setup as setup_rabbitmq from ..modules.redis import setup as setup_redis +from ..modules.remote_debug import setup_remote_debugging from .settings import 
ApplicationSettings logger = logging.getLogger(__name__) @@ -49,7 +49,7 @@ def create_app(settings: ApplicationSettings) -> FastAPI: # PLUGINS SETUP if settings.SC_BOOT_MODE == BootModeEnum.DEBUG: - remote_debug.setup(app) + setup_remote_debugging(app) setup_api_routes(app) setup_docker(app) setup_rabbitmq(app) From 7536eba38e89ba58105395ffa19feceaa963b6f9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 20:54:00 +0000 Subject: [PATCH 20/86] more robust --- .../src/simcore_service_autoscaling/modules/dask.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index d7d5746b746..d7f683fd9d6 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -1,5 +1,6 @@ import contextlib import logging +import re from collections import defaultdict from collections.abc import AsyncIterator, Coroutine from typing import Any, Final, TypeAlias @@ -52,6 +53,7 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: TypeAlias = dict[str, Any] +DASK_NAME_PATTERN: Final[re.Pattern] = re.compile(r"^.+_(ip-\d+-\d+-\d+-\d+).+$") def _dask_worker_from_ec2_instance( @@ -64,7 +66,6 @@ def _dask_worker_from_ec2_instance( DaskWorkerNotFoundError """ node_hostname = node_host_name_from_ec2_private_dns(ec2_instance) - node_ip = node_ip_from_ec2_private_dns(ec2_instance) scheduler_info = client.scheduler_info() assert client.scheduler # nosec if "workers" not in scheduler_info or not scheduler_info["workers"]: @@ -78,16 +79,18 @@ def _find_by_worker_host( dask_worker: tuple[DaskWorkerUrl, DaskWorkerDetails] ) -> bool: _, details = dask_worker - return bool(details["host"] == node_ip) or bool( - 
node_hostname in details["name"] - ) + if match := re.match(DASK_NAME_PATTERN, details["name"]): + return match.group(0) == node_hostname + return False filtered_workers = dict(filter(_find_by_worker_host, workers.items())) if not filtered_workers: raise DaskWorkerNotFoundError( worker_host=ec2_instance.aws_private_dns, url=client.scheduler.address ) - assert len(filtered_workers) == 1 # nosec + assert ( + len(filtered_workers) == 1 + ), f"returned workers {filtered_workers}, {node_hostname=}" # nosec return next(iter(filtered_workers.items())) From fd8be0e514947418ec9dee8c77a97811dd570221 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 9 Jan 2024 20:54:15 +0000 Subject: [PATCH 21/86] ensure stuff goes in the right location --- .../autoscaling/tests/manual/docker-compose-computational.yml | 4 ++++ services/autoscaling/tests/manual/docker-compose.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/services/autoscaling/tests/manual/docker-compose-computational.yml b/services/autoscaling/tests/manual/docker-compose-computational.yml index ed3aca7d163..462e3990476 100644 --- a/services/autoscaling/tests/manual/docker-compose-computational.yml +++ b/services/autoscaling/tests/manual/docker-compose-computational.yml @@ -40,6 +40,10 @@ services: ports: - 8786:8786 - 8787:8787 + deploy: + placement: + constraints: + - "node.role==manager" volumes: computational_shared_data: diff --git a/services/autoscaling/tests/manual/docker-compose.yml b/services/autoscaling/tests/manual/docker-compose.yml index 635722243ea..3c409170531 100644 --- a/services/autoscaling/tests/manual/docker-compose.yml +++ b/services/autoscaling/tests/manual/docker-compose.yml @@ -47,5 +47,9 @@ services: - "/var/run/docker.sock:/var/run/docker.sock" - ../../:/devel/services/autoscaling - ../../../../packages:/devel/packages + deploy: + placement: + constraints: + - "node.role==manager" volumes: redis-data: From 
7e9d183405ecdf16cde2b40858cc786e368f48b6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:35:43 +0000 Subject: [PATCH 22/86] fix regex --- .../src/simcore_service_autoscaling/modules/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index d7f683fd9d6..80a350572aa 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -53,7 +53,9 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: TypeAlias = dict[str, Any] -DASK_NAME_PATTERN: Final[re.Pattern] = re.compile(r"^.+_(ip-\d+-\d+-\d+-\d+).+$") +DASK_NAME_PATTERN: Final[re.Pattern] = re.compile( + r"^.+_(?P<private_ip>ip-\d+-\d+-\d+-\d+).+$" +) def _dask_worker_from_ec2_instance( @@ -80,7 +82,7 @@ def _find_by_worker_host( ) -> bool: _, details = dask_worker if match := re.match(DASK_NAME_PATTERN, details["name"]): - return match.group(0) == node_hostname + return match.group("private_ip") == node_hostname return False filtered_workers = dict(filter(_find_by_worker_host, workers.items())) From dbe2b2d2a9e9fcc71266d84d59449c327404ffda Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:35:56 +0000 Subject: [PATCH 23/86] helper function --- .../autoscaling/src/simcore_service_autoscaling/models.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index b441bdfb344..1ff6aa575bf 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -59,6
+59,14 @@ class Cluster: ) terminated_instances: list[EC2InstanceData] + def need_scaling_down(self) -> bool: + return bool( + self.active_nodes + or self.pending_nodes + or self.drained_nodes + or self.pending_ec2s + ) + DaskTaskId: TypeAlias = str From c3e1d9b3ea0da17fa6edca484ba2cdbd0e9e0b9e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:36:09 +0000 Subject: [PATCH 24/86] improve logs --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 4dc1754fa0f..e812ae515d0 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -784,14 +784,15 @@ async def _autoscale_cluster( < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): _logger.info( - "still %s unrunnable tasks after node activation", + "still %s unrunnable tasks after node activation, try to scale up...", len(still_unrunnable_tasks), ) # yes? then scale up cluster = await _scale_up_cluster( app, cluster, still_unrunnable_tasks, auto_scaling_mode ) - elif still_unrunnable_tasks == unrunnable_tasks: + elif still_unrunnable_tasks == unrunnable_tasks and cluster.need_scaling_down(): + _logger.info("there is 0 waiting task, try to scale down...") # NOTE: we only scale down in case we did not just scale up. 
The swarm needs some time to adjust await auto_scaling_mode.try_retire_nodes(app) cluster = await _deactivate_empty_nodes(app, cluster, auto_scaling_mode) From 31020589ec429f47a65c0058f2e34a48caa25685 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:36:28 +0000 Subject: [PATCH 25/86] fix dask fun issues --- .../src/simcore_service_autoscaling/modules/dask.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 80a350572aa..26feb3e5417 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -195,7 +195,10 @@ def _get_worker_used_resources( ) -> dict[str, dict]: used_resources = {} for worker_name, worker_state in dask_scheduler.workers.items(): - used_resources[worker_name] = worker_state.used_resources + if worker_state.status is distributed.Status.closing_gracefully: + used_resources[worker_name] = {} + else: + used_resources[worker_name] = worker_state.used_resources return used_resources async with _scheduler_client(url) as client: @@ -238,4 +241,6 @@ async def compute_cluster_total_resources( async def try_retire_nodes(url: AnyUrl) -> None: async with _scheduler_client(url) as client: - await _wrap_client_async_routine(client.retire_workers()) + await _wrap_client_async_routine( + client.retire_workers(close_workers=False, remove=False) + ) From c8acad88ceaad8909cb3c05906980a2e006d11cc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:41:30 +0000 Subject: [PATCH 26/86] refactor --- .../src/simcore_service_autoscaling/models.py | 11 ++++++++++- .../modules/auto_scaling_core.py | 15 +++++++++------ .../modules/auto_scaling_mode_computational.py | 6 +++++- 3 files changed, 
24 insertions(+), 8 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 1ff6aa575bf..7ecb5289949 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -59,7 +59,7 @@ class Cluster: ) terminated_instances: list[EC2InstanceData] - def need_scaling_down(self) -> bool: + def can_scale_down(self) -> bool: return bool( self.active_nodes or self.pending_nodes @@ -67,6 +67,15 @@ def need_scaling_down(self) -> bool: or self.pending_ec2s ) + def total_number_of_machines(self) -> int: + return ( + len(self.active_nodes) + + len(self.pending_nodes) + + len(self.drained_nodes) + + len(self.reserve_drained_nodes) + + len(self.pending_ec2s) + ) + DaskTaskId: TypeAlias = str diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index e812ae515d0..c62ee0a965b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -104,7 +104,7 @@ def _node_not_ready(node: Node) -> bool: terminated_instances=terminated_ec2_instances, disconnected_nodes=[n for n in docker_nodes if _node_not_ready(n)], ) - _logger.debug( + _logger.info( "current state: %s", f"{json.dumps(jsonable_encoder(cluster), indent=2)}", ) @@ -779,10 +779,13 @@ async def _autoscale_cluster( # let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - if still_unrunnable_tasks or ( - len(cluster.reserve_drained_nodes) - < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ): + if ( + still_unrunnable_tasks + or ( + len(cluster.reserve_drained_nodes) 
+ < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) + ) and cluster.total_number_of_machines() < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES: _logger.info( "still %s unrunnable tasks after node activation, try to scale up...", len(still_unrunnable_tasks), @@ -791,7 +794,7 @@ async def _autoscale_cluster( cluster = await _scale_up_cluster( app, cluster, still_unrunnable_tasks, auto_scaling_mode ) - elif still_unrunnable_tasks == unrunnable_tasks and cluster.need_scaling_down(): + elif still_unrunnable_tasks == unrunnable_tasks == 0 and cluster.can_scale_down(): _logger.info("there is 0 waiting task, try to scale down...") # NOTE: we only scale down in case we did not just scale up. The swarm needs some time to adjust await auto_scaling_mode.try_retire_nodes(app) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 54cff65e18d..20d333b74a9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -71,7 +71,11 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: queued_tasks = [] for tasks in processing_tasks_by_worker.values(): queued_tasks += tasks[1:] - _logger.info("found %s potentially queued tasks", len(queued_tasks)) + _logger.info( + "found %s unrunnable tasks and %s potentially queued tasks", + len(unrunnable_tasks), + len(queued_tasks), + ) return unrunnable_tasks + queued_tasks except DaskSchedulerNotFoundError: _logger.warning( From 83f2e4e5826cee535984238b9c88ede9d00b5dc8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:45:17 +0000 Subject: [PATCH 27/86] typo --- .../modules/auto_scaling_core.py | 12 +++++++++--- 1 file 
changed, 9 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index c62ee0a965b..472c92f6333 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -104,7 +104,7 @@ def _node_not_ready(node: Node) -> bool: terminated_instances=terminated_ec2_instances, disconnected_nodes=[n for n in docker_nodes if _node_not_ready(n)], ) - _logger.info( + _logger.debug( "current state: %s", f"{json.dumps(jsonable_encoder(cluster), indent=2)}", ) @@ -794,8 +794,14 @@ async def _autoscale_cluster( cluster = await _scale_up_cluster( app, cluster, still_unrunnable_tasks, auto_scaling_mode ) - elif still_unrunnable_tasks == unrunnable_tasks == 0 and cluster.can_scale_down(): - _logger.info("there is 0 waiting task, try to scale down...") + elif ( + len(still_unrunnable_tasks) == len(unrunnable_tasks) == 0 + and cluster.can_scale_down() + ): + _logger.info( + "there is %s waiting task, try to scale down...", + len(still_unrunnable_tasks), + ) # NOTE: we only scale down in case we did not just scale up. 
The swarm needs some time to adjust await auto_scaling_mode.try_retire_nodes(app) cluster = await _deactivate_empty_nodes(app, cluster, auto_scaling_mode) From 5e64c0d8001f20dd1da13f624c34b7e3a0480ec0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:54:14 +0100 Subject: [PATCH 28/86] manual tester --- .../tests/manual/dask-manual-tester.ipynb | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 services/autoscaling/tests/manual/dask-manual-tester.ipynb diff --git a/services/autoscaling/tests/manual/dask-manual-tester.ipynb b/services/autoscaling/tests/manual/dask-manual-tester.ipynb new file mode 100644 index 00000000000..362b6326b89 --- /dev/null +++ b/services/autoscaling/tests/manual/dask-manual-tester.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import distributed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = distributed.Client(\"tcp://XXXXXXXXXXX:8786\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def square(x):\n", + " import time\n", + " time.sleep(15)\n", + " return x ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = client.map(square, range(500), resources={\"CPU\": 1}, pure=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del A\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fct(dask_scheduler: distributed.Scheduler):\n", + " return f\"{dask_scheduler.workers}\"\n", + " return f\"{dask_scheduler.workers_to_close()}\"\n", + "print(client.run_on_scheduler(fct))" + ] + } + ], + "metadata": { + "kernelspec": 
{ + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 997aaaba530cb8a2585864b5c567f32c3a39aaf4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 17:27:49 +0100 Subject: [PATCH 29/86] mypy --- .../src/simcore_service_autoscaling/modules/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 26feb3e5417..cfc8c456c23 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -82,7 +82,7 @@ def _find_by_worker_host( ) -> bool: _, details = dask_worker if match := re.match(DASK_NAME_PATTERN, details["name"]): - return match.group("private_ip") == node_hostname + return bool(match.group("private_ip") == node_hostname) return False filtered_workers = dict(filter(_find_by_worker_host, workers.items())) @@ -193,7 +193,7 @@ async def get_worker_used_resources( def _get_worker_used_resources( dask_scheduler: distributed.Scheduler, ) -> dict[str, dict]: - used_resources = {} + used_resources: dict[str, dict] = {} for worker_name, worker_state in dask_scheduler.workers.items(): if worker_state.status is distributed.Status.closing_gracefully: used_resources[worker_name] = {} From 17f97202f70d003509414bb3287c73f6772bf962 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 11 Jan 2024 17:50:09 +0100 Subject: [PATCH 30/86] docs --- 
.../simcore_service_autoscaling/modules/auto_scaling_core.py | 5 +++-- .../src/simcore_service_autoscaling/modules/dask.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 472c92f6333..f0a4f24d1b5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -270,7 +270,7 @@ async def _activate_drained_nodes( ) -async def _try_assign_tasks_to_instances( +async def _try_assign_task_to_instances( app: FastAPI, task, auto_scaling_mode: BaseAutoscaling, @@ -337,6 +337,7 @@ async def _find_needed_instances( ) for i in cluster.active_nodes ] + # NOTE: we add pending nodes to pending ec2, since they are both unavailable for now pending_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( instance=i, assigned_tasks=[], available_resources=i.resources @@ -363,7 +364,7 @@ async def _find_needed_instances( if task_defined_ec2_type else "does NOT define ec2 type", ) - if await _try_assign_tasks_to_instances( + if await _try_assign_task_to_instances( app, task, auto_scaling_mode, diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index cfc8c456c23..53f78937a97 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -196,6 +196,7 @@ def _get_worker_used_resources( used_resources: dict[str, dict] = {} for worker_name, worker_state in dask_scheduler.workers.items(): if worker_state.status is distributed.Status.closing_gracefully: + # NOTE: when a worker was retired it is in this state used_resources[worker_name] = {} else: 
used_resources[worker_name] = worker_state.used_resources From 6d59735a49576061063d45a47bf0b24d92dd2a62 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:43:20 +0000 Subject: [PATCH 31/86] use Resources model --- .../aws-library/src/aws_library/ec2/client.py | 19 +++++++++---------- .../aws-library/src/aws_library/ec2/models.py | 8 +++----- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/client.py b/packages/aws-library/src/aws_library/ec2/client.py index f4b5436bee9..efdd22c3642 100644 --- a/packages/aws-library/src/aws_library/ec2/client.py +++ b/packages/aws-library/src/aws_library/ec2/client.py @@ -90,9 +90,13 @@ async def get_ec2_instance_capabilities( list_instances.append( EC2InstanceType( name=instance["InstanceType"], - cpus=instance["VCpuInfo"]["DefaultVCpus"], - ram=ByteSize( - int(instance["MemoryInfo"]["SizeInMiB"]) * 1024 * 1024 + resources=Resources( + cpus=instance["VCpuInfo"]["DefaultVCpus"], + ram=ByteSize( + int(instance["MemoryInfo"]["SizeInMiB"]) + * 1024 + * 1024 + ), ), ) ) @@ -173,9 +177,7 @@ async def start_aws_instance( tags=parse_obj_as( EC2Tags, {tag["Key"]: tag["Value"] for tag in instance["Tags"]} ), - resources=Resources( - cpus=instance_config.type.cpus, ram=instance_config.type.ram - ), + resources=instance_config.type.resources, ) for instance in instances["Reservations"][0]["Instances"] ] @@ -234,10 +236,7 @@ async def get_instances( else None, type=instance["InstanceType"], state=instance["State"]["Name"], - resources=Resources( - cpus=ec2_instance_types[0].cpus, - ram=ec2_instance_types[0].ram, - ), + resources=ec2_instance_types[0].resources, tags=parse_obj_as( EC2Tags, {tag["Key"]: tag["Value"] for tag in instance["Tags"]}, diff --git a/packages/aws-library/src/aws_library/ec2/models.py b/packages/aws-library/src/aws_library/ec2/models.py index 0af3e7a9c47..1765a325a0e 100644 --- 
a/packages/aws-library/src/aws_library/ec2/models.py +++ b/packages/aws-library/src/aws_library/ec2/models.py @@ -13,13 +13,12 @@ Extra, Field, NonNegativeFloat, - PositiveInt, validator, ) from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType -class Resources(BaseModel): +class Resources(BaseModel, frozen=True): cpus: NonNegativeFloat ram: ByteSize @@ -61,11 +60,10 @@ def ensure_negative_is_0(cls, v: float) -> float: return v -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True, slots=True) class EC2InstanceType: name: InstanceTypeType - cpus: PositiveInt - ram: ByteSize + resources: Resources InstancePrivateDNSName: TypeAlias = str From 92953cdaec8b595b040f57a7793b45f80a5297b6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:48:35 +0000 Subject: [PATCH 32/86] use resources, and make computing much faster by keeping available resources in instance to tasks --- .../src/simcore_service_autoscaling/models.py | 5 +- .../modules/auto_scaling_core.py | 116 ++++++++++-------- .../modules/auto_scaling_mode_base.py | 2 +- .../auto_scaling_mode_computational.py | 4 +- .../modules/auto_scaling_mode_dynamic.py | 2 +- .../utils/auto_scaling_core.py | 8 +- .../utils/computational_scaling.py | 45 +++---- .../utils/utils_ec2.py | 13 +- .../unit/test_utils_computational_scaling.py | 17 +-- .../autoscaling/tests/unit/test_utils_ec2.py | 7 +- 10 files changed, 117 insertions(+), 102 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 7ecb5289949..627949a242a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -6,17 +6,18 @@ from models_library.generated_models.docker_rest_api import Node -@dataclass(frozen=True, kw_only=True) +@dataclass(kw_only=True, slots=True) class 
AssignedTasksToInstance: instance: EC2InstanceData available_resources: Resources assigned_tasks: list -@dataclass(frozen=True, kw_only=True) +@dataclass(kw_only=True, slots=True) class AssignedTasksToInstanceType: instance_type: EC2InstanceType assigned_tasks: list + available_resources: Resources @dataclass(frozen=True) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index f0a4f24d1b5..eaa8f7cefde 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -18,7 +18,7 @@ from fastapi import FastAPI from fastapi.encoders import jsonable_encoder from models_library.generated_models.docker_rest_api import Node, NodeState -from servicelib.logging_utils import log_catch +from servicelib.logging_utils import log_catch, log_context from types_aiobotocore_ec2.literals import InstanceTypeType from ..core.errors import ( @@ -292,7 +292,7 @@ async def _try_assign_task_to_instances( drained_instances_to_tasks, needed_new_instance_types_for_tasks, ) - # try to assign the task to one of the active, pending or net created instances + # try to assign the task to one of the active, pending or newly created instances if ( await auto_scaling_mode.try_assigning_task_to_instances( app, @@ -353,61 +353,69 @@ async def _find_needed_instances( for i in cluster.drained_nodes ] needed_new_instance_types_for_tasks: list[AssignedTasksToInstanceType] = [] - for task in pending_tasks: - task_defined_ec2_type = await auto_scaling_mode.get_task_defined_instance( - app, task - ) - _logger.info( - "task %s %s", - task, - f"defines ec2 type as {task_defined_ec2_type}" - if task_defined_ec2_type - else "does NOT define ec2 type", - ) - if await _try_assign_task_to_instances( - app, - task, - auto_scaling_mode, - task_defined_ec2_type, - 
active_ec2s_to_tasks, - pending_ec2s_to_tasks, - drained_ec2s_to_tasks, - needed_new_instance_types_for_tasks, - ): - continue - - # so we need to find what we can create now - try: - # check if exact instance type is needed first - if task_defined_ec2_type: - defined_ec2 = find_selected_instance_type_for_task( - task_defined_ec2_type, available_ec2_types, auto_scaling_mode, task - ) - needed_new_instance_types_for_tasks.append( - AssignedTasksToInstanceType( - instance_type=defined_ec2, assigned_tasks=[task] + with log_context(_logger, logging.DEBUG, msg="finding needed instances"): + for task in pending_tasks: + task_defined_ec2_type = await auto_scaling_mode.get_task_defined_instance( + app, task + ) + _logger.debug( + "task %s %s", + task, + f"defines ec2 type as {task_defined_ec2_type}" + if task_defined_ec2_type + else "does NOT define ec2 type", + ) + if await _try_assign_task_to_instances( + app, + task, + auto_scaling_mode, + task_defined_ec2_type, + active_ec2s_to_tasks, + pending_ec2s_to_tasks, + drained_ec2s_to_tasks, + needed_new_instance_types_for_tasks, + ): + continue + + # so we need to find what we can create now + try: + # check if exact instance type is needed first + if task_defined_ec2_type: + defined_ec2 = find_selected_instance_type_for_task( + task_defined_ec2_type, + available_ec2_types, + auto_scaling_mode, + task, ) - ) - else: - # we go for best fitting type - best_ec2_instance = utils_ec2.find_best_fitting_ec2_instance( - available_ec2_types, - auto_scaling_mode.get_max_resources_from_task(task), - score_type=utils_ec2.closest_instance_policy, - ) - needed_new_instance_types_for_tasks.append( - AssignedTasksToInstanceType( - instance_type=best_ec2_instance, assigned_tasks=[task] + needed_new_instance_types_for_tasks.append( + AssignedTasksToInstanceType( + instance_type=defined_ec2, + assigned_tasks=[task], + available_resources=defined_ec2.resources, + ) + ) + else: + # we go for best fitting type + best_ec2_instance = 
utils_ec2.find_best_fitting_ec2_instance( + available_ec2_types, + auto_scaling_mode.get_task_required_resources(task), + score_type=utils_ec2.closest_instance_policy, + ) + needed_new_instance_types_for_tasks.append( + AssignedTasksToInstanceType( + instance_type=best_ec2_instance, + assigned_tasks=[task], + available_resources=best_ec2_instance.resources, + ) ) + except Ec2InstanceNotFoundError: + _logger.exception( + "Task %s needs more resources than any EC2 instance " + "can provide with the current configuration. Please check!", + f"{task}", ) - except Ec2InstanceNotFoundError: - _logger.exception( - "Task %s needs more resources than any EC2 instance " - "can provide with the current configuration. Please check!", - f"{task}", - ) - except Ec2InstanceInvalidError: - _logger.exception("Unexpected error:") + except Ec2InstanceInvalidError: + _logger.exception("Unexpected error:") num_instances_per_type = collections.defaultdict( int, diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index 6f512b4e8c5..e6d19f4aaac 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -83,7 +83,7 @@ async def progress_message_from_tasks( @staticmethod @abstractmethod - def get_max_resources_from_task(task) -> Resources: + def get_task_required_resources(task) -> Resources: ... 
@staticmethod diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 20d333b74a9..4bbcfe202bc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -129,8 +129,8 @@ async def progress_message_from_tasks(app: FastAPI, tasks: list, progress: float _logger.info("PROGRESS: %s", progress) @staticmethod - def get_max_resources_from_task(task) -> Resources: - return utils.get_max_resources_from_dask_task(task) + def get_task_required_resources(task) -> Resources: + return utils.resources_from_dask_task(task) @staticmethod async def get_task_defined_instance(app: FastAPI, task) -> InstanceTypeType | None: diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index 3dcf03e36fb..7f3605082c1 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -94,7 +94,7 @@ async def progress_message_from_tasks( await progress_tasks_message(app, tasks, progress=progress) @staticmethod - def get_max_resources_from_task(task) -> Resources: + def get_task_required_resources(task) -> Resources: return utils_docker.get_max_resources_from_docker_task(task) @staticmethod diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py index f0fe674e9db..2b8504d7c8d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py @@ -7,7 +7,6 @@ EC2InstanceBootSpecific, EC2InstanceData, EC2InstanceType, - Resources, ) from models_library.generated_models.docker_rest_api import Node from types_aiobotocore_ec2.literals import InstanceTypeType @@ -202,12 +201,13 @@ def find_selected_instance_type_for_task( selected_instance = filtered_instances[0] # check that the assigned resources and the machine resource fit - if auto_scaling_mode.get_max_resources_from_task(task) > Resources( - cpus=selected_instance.cpus, ram=selected_instance.ram + if ( + auto_scaling_mode.get_task_required_resources(task) + > selected_instance.resources ): msg = ( f"Task {task} requires more resources than the selected instance provides." - f" Asked for {selected_instance}, but task needs {auto_scaling_mode.get_max_resources_from_task(task)}. Please check!" + f" Asked for {selected_instance}, but task needs {auto_scaling_mode.get_task_required_resources(task)}. Please check!" 
) raise Ec2InstanceInvalidError(msg=msg) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index a640ca9b7c8..b5a4b51374d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -24,7 +24,7 @@ _DEFAULT_MAX_RAM: Final[int] = 1024 -def get_max_resources_from_dask_task(task: DaskTask) -> Resources: +def resources_from_dask_task(task: DaskTask) -> Resources: return Resources( cpus=task.required_resources.get("CPU", _DEFAULT_MAX_CPU), ram=task.required_resources.get("RAM", _DEFAULT_MAX_RAM), @@ -38,23 +38,23 @@ def get_task_instance_restriction(task: DaskTask) -> str | None: return instance_ec2_type -def _compute_tasks_needed_resources(tasks: list[DaskTask]) -> Resources: - total = Resources.create_as_empty() - for t in tasks: - total += get_max_resources_from_dask_task(t) - return total +def _compute_tasks_resources(tasks: list[DaskTask]) -> Resources: + return sum( + (resources_from_dask_task(t) for t in tasks), + Resources.create_as_empty(), + ) def try_assigning_task_to_node( pending_task: DaskTask, instance_to_tasks: Iterable[tuple[AssociatedInstance, list[DaskTask]]], ) -> bool: + task_resources = resources_from_dask_task(pending_task) for instance, node_assigned_tasks in instance_to_tasks: - instance_total_resource = instance.ec2_instance.resources - tasks_needed_resources = _compute_tasks_needed_resources(node_assigned_tasks) + instance_used_resources = _compute_tasks_resources(node_assigned_tasks) if ( - instance_total_resource - tasks_needed_resources - ) >= get_max_resources_from_dask_task(pending_task): + instance.ec2_instance.resources - instance_used_resources + ) >= task_resources: node_assigned_tasks.append(pending_task) return True return False @@ -72,14 +72,11 @@ async def 
try_assigning_task_to_instances( instance_max_time_to_start = ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) + task_required_resources = resources_from_dask_task(pending_task) for assigned_tasks_to_instance in instances_to_tasks: - tasks_needed_resources = _compute_tasks_needed_resources( - assigned_tasks_to_instance.assigned_tasks - ) - if ( - assigned_tasks_to_instance.available_resources - tasks_needed_resources - ) >= get_max_resources_from_dask_task(pending_task): + if assigned_tasks_to_instance.available_resources >= task_required_resources: assigned_tasks_to_instance.assigned_tasks.append(pending_task) + assigned_tasks_to_instance.available_resources -= task_required_resources if notify_progress: now = datetime.datetime.now(datetime.timezone.utc) time_since_launch = ( @@ -108,17 +105,15 @@ def try_assigning_task_to_instance_types( pending_task: DaskTask, instance_types_to_tasks: list[AssignedTasksToInstanceType], ) -> bool: + task_required_resources = resources_from_dask_task(pending_task) for assigned_tasks_to_instance_type in instance_types_to_tasks: - instance_total_resource = Resources( - cpus=assigned_tasks_to_instance_type.instance_type.cpus, - ram=assigned_tasks_to_instance_type.instance_type.ram, - ) - tasks_needed_resources = _compute_tasks_needed_resources( - assigned_tasks_to_instance_type.assigned_tasks - ) if ( - instance_total_resource - tasks_needed_resources - ) >= get_max_resources_from_dask_task(pending_task): + assigned_tasks_to_instance_type.available_resources + >= task_required_resources + ): assigned_tasks_to_instance_type.assigned_tasks.append(pending_task) + assigned_tasks_to_instance_type.available_resources -= ( + task_required_resources + ) return True return False diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py index afb4d224311..d0ab38f352d 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py @@ -73,12 +73,19 @@ def closest_instance_policy( ec2_instance: EC2InstanceType, resources: Resources, ) -> float: - if ec2_instance.cpus < resources.cpus or ec2_instance.ram < resources.ram: + if ( + ec2_instance.resources.cpus < resources.cpus + or ec2_instance.resources.ram < resources.ram + ): return 0 # compute a score for all the instances that are above expectations # best is the exact ec2 instance - cpu_ratio = float(ec2_instance.cpus - resources.cpus) / float(ec2_instance.cpus) - ram_ratio = float(ec2_instance.ram - resources.ram) / float(ec2_instance.ram) + cpu_ratio = float(ec2_instance.resources.cpus - resources.cpus) / float( + ec2_instance.resources.cpus + ) + ram_ratio = float(ec2_instance.resources.ram - resources.ram) / float( + ec2_instance.resources.ram + ) return 100 * (1.0 - cpu_ratio) * (1.0 - ram_ratio) diff --git a/services/autoscaling/tests/unit/test_utils_computational_scaling.py b/services/autoscaling/tests/unit/test_utils_computational_scaling.py index fa06a5a18b8..737218d4d08 100644 --- a/services/autoscaling/tests/unit/test_utils_computational_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_computational_scaling.py @@ -24,7 +24,7 @@ from simcore_service_autoscaling.utils.computational_scaling import ( _DEFAULT_MAX_CPU, _DEFAULT_MAX_RAM, - get_max_resources_from_dask_task, + resources_from_dask_task, try_assigning_task_to_instance_types, try_assigning_task_to_instances, try_assigning_task_to_node, @@ -64,10 +64,8 @@ ), ], ) -def test_get_max_resources_from_dask_task( - dask_task: DaskTask, expected_resource: Resources -): - assert get_max_resources_from_dask_task(dask_task) == expected_resource +def test_resources_from_dask_task(dask_task: DaskTask, expected_resource: Resources): + assert resources_from_dask_task(dask_task) == expected_resource @pytest.fixture @@ -209,10 
+207,15 @@ def test_try_assigning_task_to_instance_types( task = fake_task(required_resources={"CPU": 2}) # create an instance type with some CPUs fake_instance_type = EC2InstanceType( - name=faker.name(), cpus=6, ram=parse_obj_as(ByteSize, "2GiB") + name=faker.name(), + resources=Resources(cpus=6, ram=parse_obj_as(ByteSize, "2GiB")), ) instance_type_to_tasks: list[AssignedTasksToInstanceType] = [ - AssignedTasksToInstanceType(instance_type=fake_instance_type, assigned_tasks=[]) + AssignedTasksToInstanceType( + instance_type=fake_instance_type, + assigned_tasks=[], + available_resources=fake_instance_type.resources, + ) ] # now this should work 3 times assert try_assigning_task_to_instance_types(task, instance_type_to_tasks) is True diff --git a/services/autoscaling/tests/unit/test_utils_ec2.py b/services/autoscaling/tests/unit/test_utils_ec2.py index 8697c8d5f11..49b06586cc8 100644 --- a/services/autoscaling/tests/unit/test_utils_ec2.py +++ b/services/autoscaling/tests/unit/test_utils_ec2.py @@ -34,8 +34,7 @@ def random_fake_available_instances(faker: Faker) -> list[EC2InstanceType]: list_of_instances = [ EC2InstanceType( name=faker.pystr(), - cpus=n, - ram=ByteSize(n), + resources=Resources(cpus=n, ram=ByteSize(n)), ) for n in range(1, 30) ] @@ -59,7 +58,9 @@ async def test_find_best_fitting_ec2_instance_closest_instance_policy_with_resou [ ( Resources(cpus=n, ram=ByteSize(n)), - EC2InstanceType(name="c5ad.12xlarge", cpus=n, ram=ByteSize(n)), + EC2InstanceType( + name="c5ad.12xlarge", resources=Resources(cpus=n, ram=ByteSize(n)) + ), ) for n in range(1, 30) ], From 5df6fc278423cfdcd6a0f53ae27853cb3110a267 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:50:30 +0000 Subject: [PATCH 33/86] type --- .../modules/auto_scaling_mode_computational.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 4bbcfe202bc..e68e462ea51 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -1,6 +1,7 @@ import collections import logging from collections.abc import Iterable +from typing import cast from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI @@ -135,7 +136,7 @@ def get_task_required_resources(task) -> Resources: @staticmethod async def get_task_defined_instance(app: FastAPI, task) -> InstanceTypeType | None: assert app # nosec - return utils.get_task_instance_restriction(task) + return cast(InstanceTypeType | None, utils.get_task_instance_restriction(task)) @staticmethod async def compute_node_used_resources( From 71867f46b979f04403cbc0990c65f36d310503d8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 16:11:11 +0000 Subject: [PATCH 34/86] simplify --- .../modules/auto_scaling_core.py | 23 +++++++++----- .../modules/auto_scaling_mode_base.py | 7 ----- .../auto_scaling_mode_computational.py | 8 ----- .../modules/auto_scaling_mode_dynamic.py | 8 ----- .../utils/computational_scaling.py | 23 +------------- .../utils/dynamic_scaling.py | 24 +-------------- .../unit/test_utils_computational_scaling.py | 30 ------------------- 7 files changed, 17 insertions(+), 106 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index eaa8f7cefde..cf4cb0d920e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -221,10 +221,11 @@ async def _activate_drained_nodes( # nothing to do return [], cluster - activatable_nodes: list[tuple[AssociatedInstance, list]] = [ - ( - node, - [], + activatable_instances: list[AssignedTasksToInstance] = [ + AssignedTasksToInstance( + instance=node.ec2_instance, + available_resources=node.ec2_instance.resources, + assigned_tasks=[], ) for node in itertools.chain( cluster.drained_nodes, cluster.reserve_drained_nodes @@ -234,13 +235,19 @@ async def _activate_drained_nodes( still_pending_tasks = [ task for task in pending_tasks - if not auto_scaling_mode.try_assigning_task_to_node(task, activatable_nodes) + if not await auto_scaling_mode.try_assigning_task_to_instances( + app, task, activatable_instances, notify_progress=False + ) ] nodes_to_activate = [ - (node, assigned_tasks) - for node, assigned_tasks in activatable_nodes - if assigned_tasks + (node, assigned_tasks_to_instance.assigned_tasks) + for assigned_tasks_to_instance, node in zip( + activatable_instances, + itertools.chain(cluster.drained_nodes, cluster.reserve_drained_nodes), + strict=True, + ) + if assigned_tasks_to_instance.assigned_tasks ] # activate these nodes now diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index e6d19f4aaac..d967b1a47f4 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -41,13 +41,6 @@ def get_new_node_docker_tags( async def list_unrunnable_tasks(app: FastAPI) -> list: ... - @staticmethod - @abstractmethod - def try_assigning_task_to_node( - task, instances_to_tasks: list[tuple[AssociatedInstance, list]] - ) -> bool: - ... 
- @staticmethod @abstractmethod async def try_assigning_task_to_instances( diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index e68e462ea51..e2452f78203 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -1,6 +1,5 @@ import collections import logging -from collections.abc import Iterable from typing import cast from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources @@ -84,13 +83,6 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: ) return [] - @staticmethod - def try_assigning_task_to_node( - task: DaskTask, - instances_to_tasks: Iterable[tuple[AssociatedInstance, list[DaskTask]]], - ) -> bool: - return utils.try_assigning_task_to_node(task, instances_to_tasks) - @staticmethod async def try_assigning_task_to_instances( app: FastAPI, diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index 7f3605082c1..e3205dde911 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -1,5 +1,3 @@ -from collections.abc import Iterable - from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI from models_library.docker import DockerLabelKey @@ -51,12 +49,6 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[Task]: service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS, ) - @staticmethod - def try_assigning_task_to_node( - task, instances_to_tasks: 
Iterable[tuple[AssociatedInstance, list]] - ) -> bool: - return utils.try_assigning_task_to_node(task, instances_to_tasks) - @staticmethod async def try_assigning_task_to_instances( app: FastAPI, diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index b5a4b51374d..b6be6e11fa9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -1,6 +1,5 @@ import datetime import logging -from collections.abc import Iterable from typing import Final from aws_library.ec2.models import Resources @@ -11,12 +10,7 @@ from servicelib.utils_formatting import timedelta_as_minute_second from ..core.settings import get_application_settings -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, - DaskTask, -) +from ..models import AssignedTasksToInstance, AssignedTasksToInstanceType, DaskTask _logger = logging.getLogger(__name__) @@ -45,21 +39,6 @@ def _compute_tasks_resources(tasks: list[DaskTask]) -> Resources: ) -def try_assigning_task_to_node( - pending_task: DaskTask, - instance_to_tasks: Iterable[tuple[AssociatedInstance, list[DaskTask]]], -) -> bool: - task_resources = resources_from_dask_task(pending_task) - for instance, node_assigned_tasks in instance_to_tasks: - instance_used_resources = _compute_tasks_resources(node_assigned_tasks) - if ( - instance.ec2_instance.resources - instance_used_resources - ) >= task_resources: - node_assigned_tasks.append(pending_task) - return True - return False - - async def try_assigning_task_to_instances( app: FastAPI, pending_task: DaskTask, diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py index 
a0d590727f2..77366751183 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py @@ -1,6 +1,5 @@ import datetime import logging -from collections.abc import Iterable from aws_library.ec2.models import Resources from fastapi import FastAPI @@ -8,34 +7,13 @@ from servicelib.utils_formatting import timedelta_as_minute_second from ..core.settings import get_application_settings -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, -) +from ..models import AssignedTasksToInstance, AssignedTasksToInstanceType from . import utils_docker from .rabbitmq import log_tasks_message, progress_tasks_message logger = logging.getLogger(__name__) -def try_assigning_task_to_node( - pending_task: Task, - instances_to_tasks: Iterable[tuple[AssociatedInstance, list[Task]]], -) -> bool: - for instance, node_assigned_tasks in instances_to_tasks: - instance_total_resource = instance.ec2_instance.resources - tasks_needed_resources = utils_docker.compute_tasks_needed_resources( - node_assigned_tasks - ) - if ( - instance_total_resource - tasks_needed_resources - ) >= utils_docker.get_max_resources_from_docker_task(pending_task): - node_assigned_tasks.append(pending_task) - return True - return False - - def try_assigning_task_to_instance_types( pending_task: Task, instance_types_to_tasks: list[AssignedTasksToInstanceType], diff --git a/services/autoscaling/tests/unit/test_utils_computational_scaling.py b/services/autoscaling/tests/unit/test_utils_computational_scaling.py index 737218d4d08..750f43e2b03 100644 --- a/services/autoscaling/tests/unit/test_utils_computational_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_computational_scaling.py @@ -27,7 +27,6 @@ resources_from_dask_task, try_assigning_task_to_instance_types, try_assigning_task_to_instances, - try_assigning_task_to_node, ) @@ -93,13 +92,6 @@ def 
_creator(**overrides) -> DaskTask: return _creator -async def test_try_assigning_task_to_node_with_no_instances( - fake_task: Callable[..., DaskTask], -): - task = fake_task() - assert try_assigning_task_to_node(task, []) is False - - @pytest.fixture def fake_associated_host_instance( host_node: DockerNode, @@ -111,28 +103,6 @@ def fake_associated_host_instance( ) -async def test_try_assigning_task_to_node( - fake_task: Callable[..., DaskTask], - fake_associated_host_instance: AssociatedInstance, -): - task = fake_task(required_resources={"CPU": 2}) - assert fake_associated_host_instance.node.Description - assert fake_associated_host_instance.node.Description.Resources - # we set the node to have 4 CPUs - fake_associated_host_instance.node.Description.Resources.NanoCPUs = int(4e9) - instance_to_tasks: list[tuple[AssociatedInstance, list[DaskTask]]] = [ - (fake_associated_host_instance, []) - ] - assert try_assigning_task_to_node(task, instance_to_tasks) is True - assert instance_to_tasks[0][1] == [task] - # this should work again - assert try_assigning_task_to_node(task, instance_to_tasks) is True - assert instance_to_tasks[0][1] == [task, task] - # this should now fail - assert try_assigning_task_to_node(task, instance_to_tasks) is False - assert instance_to_tasks[0][1] == [task, task] - - async def test_try_assigning_task_to_instances_with_no_instances( fake_app: mock.Mock, fake_task: Callable[..., DaskTask], From b82c7f1be1a14dcaa1e8989e53d9ba0869f67942 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 16:23:02 +0000 Subject: [PATCH 35/86] refactor --- .../src/simcore_service_autoscaling/models.py | 14 ++++++++ .../modules/auto_scaling_core.py | 17 +++++---- .../utils/computational_scaling.py | 24 ++++++------- .../utils/dynamic_scaling.py | 35 +++++++------------ 4 files changed, 46 insertions(+), 44 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py 
b/services/autoscaling/src/simcore_service_autoscaling/models.py index 627949a242a..27f0d1cc2b0 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -12,6 +12,13 @@ class AssignedTasksToInstance: available_resources: Resources assigned_tasks: list + def has_resources_for_task(self, task_resources: Resources) -> bool: + return bool(self.available_resources >= task_resources) + + def assign_task(self, task, task_resources: Resources) -> None: + self.assigned_tasks.append(task) + self.available_resources -= task_resources + @dataclass(kw_only=True, slots=True) class AssignedTasksToInstanceType: @@ -19,6 +26,13 @@ class AssignedTasksToInstanceType: assigned_tasks: list available_resources: Resources + def has_resources_for_task(self, task_resources: Resources) -> bool: + return bool(self.available_resources >= task_resources) + + def assign_task(self, task, task_resources: Resources) -> None: + self.assigned_tasks.append(task) + self.available_resources -= task_resources + @dataclass(frozen=True) class AssociatedInstance: diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index cf4cb0d920e..edb03583803 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -795,13 +795,15 @@ async def _autoscale_cluster( # let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - if ( - still_unrunnable_tasks - or ( - len(cluster.reserve_drained_nodes) - < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ) - ) and cluster.total_number_of_machines() < 
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES: + if still_unrunnable_tasks or ( + len(cluster.reserve_drained_nodes) + < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ): + # we might want to scale up if we do not already have reached the maximum amount of machines + # if ( + # cluster.total_number_of_machines() + # < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + # ): _logger.info( "still %s unrunnable tasks after node activation, try to scale up...", len(still_unrunnable_tasks), @@ -810,6 +812,7 @@ async def _autoscale_cluster( cluster = await _scale_up_cluster( app, cluster, still_unrunnable_tasks, auto_scaling_mode ) + # give feedback on machine creation elif ( len(still_unrunnable_tasks) == len(unrunnable_tasks) == 0 and cluster.can_scale_down() diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index b6be6e11fa9..27a8ef1a718 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -41,7 +41,7 @@ def _compute_tasks_resources(tasks: list[DaskTask]) -> Resources: async def try_assigning_task_to_instances( app: FastAPI, - pending_task: DaskTask, + task: DaskTask, instances_to_tasks: list[AssignedTasksToInstance], *, notify_progress: bool, @@ -51,11 +51,11 @@ async def try_assigning_task_to_instances( instance_max_time_to_start = ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) - task_required_resources = resources_from_dask_task(pending_task) + task_required_resources = resources_from_dask_task(task) for assigned_tasks_to_instance in instances_to_tasks: - if assigned_tasks_to_instance.available_resources >= task_required_resources: - assigned_tasks_to_instance.assigned_tasks.append(pending_task) - 
assigned_tasks_to_instance.available_resources -= task_required_resources + if assigned_tasks_to_instance.has_resources_for_task(task_required_resources): + assigned_tasks_to_instance.assign_task(task, task_required_resources) + if notify_progress: now = datetime.datetime.now(datetime.timezone.utc) time_since_launch = ( @@ -81,18 +81,14 @@ async def try_assigning_task_to_instances( def try_assigning_task_to_instance_types( - pending_task: DaskTask, + task: DaskTask, instance_types_to_tasks: list[AssignedTasksToInstanceType], ) -> bool: - task_required_resources = resources_from_dask_task(pending_task) + task_required_resources = resources_from_dask_task(task) for assigned_tasks_to_instance_type in instance_types_to_tasks: - if ( - assigned_tasks_to_instance_type.available_resources - >= task_required_resources + if assigned_tasks_to_instance_type.has_resources_for_task( + task_required_resources ): - assigned_tasks_to_instance_type.assigned_tasks.append(pending_task) - assigned_tasks_to_instance_type.available_resources -= ( - task_required_resources - ) + assigned_tasks_to_instance_type.assign_task(task) return True return False diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py index 77366751183..5203dd17f0b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py @@ -1,7 +1,6 @@ import datetime import logging -from aws_library.ec2.models import Resources from fastapi import FastAPI from models_library.generated_models.docker_rest_api import Task from servicelib.utils_formatting import timedelta_as_minute_second @@ -15,28 +14,22 @@ def try_assigning_task_to_instance_types( - pending_task: Task, + task: Task, instance_types_to_tasks: list[AssignedTasksToInstanceType], ) -> bool: + task_required_resources = 
utils_docker.get_max_resources_from_docker_task(task) for assigned_tasks_to_instance_type in instance_types_to_tasks: - instance_total_resource = Resources( - cpus=assigned_tasks_to_instance_type.instance_type.cpus, - ram=assigned_tasks_to_instance_type.instance_type.ram, - ) - tasks_needed_resources = utils_docker.compute_tasks_needed_resources( - assigned_tasks_to_instance_type.assigned_tasks - ) - if ( - instance_total_resource - tasks_needed_resources - ) >= utils_docker.get_max_resources_from_docker_task(pending_task): - assigned_tasks_to_instance_type.assigned_tasks.append(pending_task) + if assigned_tasks_to_instance_type.has_resources_for_task( + task_required_resources + ): + assigned_tasks_to_instance_type.assign_task(task) return True return False async def try_assigning_task_to_instances( app: FastAPI, - pending_task: Task, + task: Task, instances_to_tasks: list[AssignedTasksToInstance], *, notify_progress: bool, @@ -46,14 +39,10 @@ async def try_assigning_task_to_instances( instance_max_time_to_start = ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) + task_required_resources = utils_docker.get_max_resources_from_docker_task(task) for assigned_tasks_to_instance in instances_to_tasks: - tasks_needed_resources = utils_docker.compute_tasks_needed_resources( - assigned_tasks_to_instance.assigned_tasks - ) - if ( - assigned_tasks_to_instance.available_resources - tasks_needed_resources - ) >= utils_docker.get_max_resources_from_docker_task(pending_task): - assigned_tasks_to_instance.assigned_tasks.append(pending_task) + if assigned_tasks_to_instance.has_resources_for_task(task_required_resources): + assigned_tasks_to_instance.assign_task(task, task_required_resources) if notify_progress: now = datetime.datetime.now(datetime.timezone.utc) time_since_launch = ( @@ -67,13 +56,13 @@ async def try_assigning_task_to_instances( await log_tasks_message( app, - [pending_task], + [task], f"adding machines to the cluster (time waiting: 
{timedelta_as_minute_second(time_since_launch)}, " f"est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait...", ) await progress_tasks_message( app, - [pending_task], + [task], time_since_launch.total_seconds() / instance_max_time_to_start.total_seconds(), ) From 726d07c336ac9378ee09c4e0bd07935ba2c368a9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 16:25:11 +0000 Subject: [PATCH 36/86] typo --- .../simcore_service_autoscaling/utils/computational_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index 27a8ef1a718..89645a1d6d6 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -89,6 +89,6 @@ def try_assigning_task_to_instance_types( if assigned_tasks_to_instance_type.has_resources_for_task( task_required_resources ): - assigned_tasks_to_instance_type.assign_task(task) + assigned_tasks_to_instance_type.assign_task(task, task_required_resources) return True return False From 06867f03f0a7d197afc4234f1bf52def31802c3f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 12 Jan 2024 17:14:44 +0000 Subject: [PATCH 37/86] added delay for removing node --- .../modules/auto_scaling_core.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index edb03583803..e8a34b1935f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -5,7 +5,7 @@ import itertools import json import logging -from typing import cast +from typing import Final, cast import arrow from aws_library.ec2.models import ( @@ -111,11 +111,22 @@ def _node_not_ready(node: Node) -> bool: return cluster +_DELAY_FOR_REMOVING_DISCONNECTED_NODES_S: Final[int] = 30 + + async def _cleanup_disconnected_nodes(app: FastAPI, cluster: Cluster) -> Cluster: - if cluster.disconnected_nodes: - await utils_docker.remove_nodes( - get_docker_client(app), nodes=cluster.disconnected_nodes + utc_now = arrow.utcnow().datetime + removeable_nodes = [ + node + for node in cluster.disconnected_nodes + if node.UpdatedAt + and ( + (utc_now - arrow.get(node.UpdatedAt).datetime).total_seconds() + > _DELAY_FOR_REMOVING_DISCONNECTED_NODES_S ) + ] + if removeable_nodes: + await utils_docker.remove_nodes(get_docker_client(app), nodes=removeable_nodes) return dataclasses.replace(cluster, disconnected_nodes=[]) From 15c904a8075bacf9dafcd8bfaef2c612dca092e8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 13 Jan 2024 22:02:50 +0100 Subject: [PATCH 38/86] ruff --- .../src/simcore_service_autoscaling/core/application.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/application.py b/services/autoscaling/src/simcore_service_autoscaling/core/application.py index 5dab6100233..6bd496f0798 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/application.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/application.py @@ -1,10 +1,10 @@ import logging from fastapi import FastAPI +from models_library.basic_types import BootModeEnum from servicelib.fastapi.prometheus_instrumentation import ( setup_prometheus_instrumentation, ) -from models_library.basic_types import BootModeEnum from .._meta import ( API_VERSION, From 
e003438ca9e3a71d507b2581b9bc2be170a19c3d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 08:03:23 +0100 Subject: [PATCH 39/86] missing parameter --- .../src/simcore_service_autoscaling/utils/dynamic_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py index 5203dd17f0b..ce7fce4297a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py @@ -22,7 +22,7 @@ def try_assigning_task_to_instance_types( if assigned_tasks_to_instance_type.has_resources_for_task( task_required_resources ): - assigned_tasks_to_instance_type.assign_task(task) + assigned_tasks_to_instance_type.assign_task(task, task_required_resources) return True return False From 89415bd6187aae083fa17251a979637d3836c4bd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 08:37:01 +0100 Subject: [PATCH 40/86] keep task assignment in cluster --- .../autoscaling/src/simcore_service_autoscaling/models.py | 7 ++++++- .../modules/auto_scaling_core.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 27f0d1cc2b0..4157407add5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -34,10 +34,15 @@ def assign_task(self, task, task_resources: Resources) -> None: self.available_resources -= task_resources -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True, slots=True) class AssociatedInstance: node: Node ec2_instance: EC2InstanceData + assigned_tasks: 
list = field(default_factory=list) + _available_resources: Resources = field(init=False) + + def __post_init__(self) -> None: + super().__setattr__("_available_resources", self.ec2_instance.resources) @dataclass(frozen=True, kw_only=True, slots=True) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index e8a34b1935f..cd7206cfd12 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -338,6 +338,12 @@ async def _try_assign_task_to_instances( return False +async def _assign_tasks_to_current_cluster( + app: FastAPI, tasks: list, cluster: Cluster +) -> Cluster: + ... + + async def _find_needed_instances( app: FastAPI, pending_tasks: list, From 9c33fa731366f650e45988642910b729ff3fca0a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:40:36 +0000 Subject: [PATCH 41/86] preparing to add assigned tasks in cluster --- .../src/simcore_service_autoscaling/models.py | 40 ++++++++++++++++++- .../modules/auto_scaling_core.py | 36 ++++++++++++----- .../utils/auto_scaling_core.py | 4 +- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 4157407add5..18b4e6a225b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -42,7 +42,43 @@ class AssociatedInstance: _available_resources: Resources = field(init=False) def __post_init__(self) -> None: - super().__setattr__("_available_resources", self.ec2_instance.resources) + object.__setattr__(self, "_available_resources", self.ec2_instance.resources) + + def has_resources_for_task(self, 
task_resources: Resources) -> bool: + return bool(self._available_resources >= task_resources) + + def assign_task(self, task, task_resources: Resources) -> None: + self.assigned_tasks.append(task) + object.__setattr__( + self, "_available_resources", self._available_resources - task_resources + ) + + @property + def available_resources(self) -> Resources: + return self._available_resources + + +@dataclass(frozen=True, kw_only=True, slots=True) +class NonAssociatedInstance: + ec2_instance: EC2InstanceData + assigned_tasks: list = field(default_factory=list) + _available_resources: Resources = field(init=False) + + def __post_init__(self) -> None: + object.__setattr__(self, "_available_resources", self.ec2_instance.resources) + + def has_resources_for_task(self, task_resources: Resources) -> bool: + return bool(self._available_resources >= task_resources) + + def assign_task(self, task, task_resources: Resources) -> None: + self.assigned_tasks.append(task) + object.__setattr__( + self, "_available_resources", self._available_resources - task_resources + ) + + @property + def available_resources(self) -> Resources: + return self._available_resources @dataclass(frozen=True, kw_only=True, slots=True) @@ -67,7 +103,7 @@ class Cluster: "description": "This is a EC2 backed docker node which is drained in the reserve if this is enabled (with no tasks)" } ) - pending_ec2s: list[EC2InstanceData] = field( + pending_ec2s: list[NonAssociatedInstance] = field( metadata={ "description": "This is an EC2 instance that is not yet associated to a docker node" } diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index cd7206cfd12..a2f076c6e5b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -34,6 +34,7 @@ 
AssignedTasksToInstanceType, AssociatedInstance, Cluster, + NonAssociatedInstance, ) from ..utils import utils_docker, utils_ec2 from ..utils.auto_scaling_core import ( @@ -100,7 +101,7 @@ def _node_not_ready(node: Node) -> bool: reserve_drained_nodes=all_drained_nodes[ : app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ], - pending_ec2s=pending_ec2s, + pending_ec2s=[NonAssociatedInstance(ec2_instance=i) for i in pending_ec2s], terminated_instances=terminated_ec2_instances, disconnected_nodes=[n for n in docker_nodes if _node_not_ready(n)], ) @@ -135,12 +136,14 @@ async def _try_attach_pending_ec2s( ) -> Cluster: """label the drained instances that connected to the swarm which are missing the monitoring labels""" new_found_instances: list[AssociatedInstance] = [] - still_pending_ec2s: list[EC2InstanceData] = [] + still_pending_ec2s: list[NonAssociatedInstance] = [] app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec for instance_data in cluster.pending_ec2s: try: - node_host_name = node_host_name_from_ec2_private_dns(instance_data) + node_host_name = node_host_name_from_ec2_private_dns( + instance_data.ec2_instance + ) if new_node := await utils_docker.find_node_with_name( get_docker_client(app), node_host_name ): @@ -148,11 +151,19 @@ async def _try_attach_pending_ec2s( new_node = await utils_docker.tag_node( get_docker_client(app), new_node, - tags=auto_scaling_mode.get_new_node_docker_tags(app, instance_data), + tags=auto_scaling_mode.get_new_node_docker_tags( + app, instance_data.ec2_instance + ), available=False, ) - new_found_instances.append(AssociatedInstance(new_node, instance_data)) - _logger.info("Attached new EC2 instance %s", instance_data.id) + new_found_instances.append( + AssociatedInstance( + node=new_node, ec2_instance=instance_data.ec2_instance + ) + ) + _logger.info( + "Attached new EC2 instance %s", instance_data.ec2_instance.id + ) else: still_pending_ec2s.append(instance_data) 
except Ec2InvalidDnsNameError: # noqa: PERF203 @@ -364,9 +375,12 @@ async def _find_needed_instances( # NOTE: we add pending nodes to pending ec2, since they are both unavailable for now pending_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( - instance=i, assigned_tasks=[], available_resources=i.resources + instance=i, + assigned_tasks=[], + available_resources=i.resources, ) - for i in cluster.pending_ec2s + [i.ec2_instance for i in cluster.pending_nodes] + for i in [i.ec2_instance for i in cluster.pending_ec2s] + + [i.ec2_instance for i in cluster.pending_nodes] ] drained_ec2s_to_tasks: list[AssignedTasksToInstance] = [ AssignedTasksToInstance( @@ -662,7 +676,9 @@ async def _scale_up_cluster( f"{len(new_pending_instances)} new machines being started, please wait...", level=logging.INFO, ) - cluster.pending_ec2s.extend(new_pending_instances) + cluster.pending_ec2s.extend( + [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances] + ) # NOTE: to check the logs of UserData in EC2 instance # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance @@ -712,7 +728,7 @@ async def _deactivate_empty_nodes( f"{[node.Description.Hostname for node in updated_nodes if node.Description]}", ) newly_drained_instances = [ - AssociatedInstance(node, instance.ec2_instance) + AssociatedInstance(node=node, ec2_instance=instance.ec2_instance) for instance, node in zip(active_empty_instances, updated_nodes, strict=True) ] return dataclasses.replace( diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py index 2b8504d7c8d..60e18ad58a6 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py @@ -72,7 +72,9 @@ def _find_node_with_name(node: Node) -> bool: continue if node := 
next(iter(filter(_find_node_with_name, nodes)), None): - associated_instances.append(AssociatedInstance(node, instance_data)) + associated_instances.append( + AssociatedInstance(node=node, ec2_instance=instance_data) + ) else: non_associated_instances.append(instance_data) return associated_instances, non_associated_instances From c09b5e79fe8de0a4464dcee6ef4a946068a66407 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:50:37 +0000 Subject: [PATCH 42/86] fix too many instances in --- .../modules/auto_scaling_core.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index a2f076c6e5b..42137730db8 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -81,14 +81,12 @@ def _node_not_ready(node: Node) -> bool: assert node.Status # nosec return bool(node.Status.State != NodeState.ready) - all_drained_nodes = [ - i for i in attached_ec2s if auto_scaling_mode.is_instance_drained(i) - ] - - active_nodes, pending_nodes = [], [] + active_nodes, pending_nodes, all_drained_nodes = [], [], [] for instance in attached_ec2s: if await auto_scaling_mode.is_instance_active(app, instance): active_nodes.append(instance) + elif auto_scaling_mode.is_instance_drained(instance): + all_drained_nodes.append(instance) else: pending_nodes.append(instance) From 4f2e7293fd53b7b1e788e7020a75cc1ea0a5969c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 12:14:59 +0000 Subject: [PATCH 43/86] allow to use replace --- .../autoscaling/src/simcore_service_autoscaling/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 18b4e6a225b..664c807b1fe 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -39,7 +39,7 @@ class AssociatedInstance: node: Node ec2_instance: EC2InstanceData assigned_tasks: list = field(default_factory=list) - _available_resources: Resources = field(init=False) + _available_resources: Resources = field(default_factory=Resources.create_as_empty) def __post_init__(self) -> None: object.__setattr__(self, "_available_resources", self.ec2_instance.resources) @@ -62,7 +62,7 @@ def available_resources(self) -> Resources: class NonAssociatedInstance: ec2_instance: EC2InstanceData assigned_tasks: list = field(default_factory=list) - _available_resources: Resources = field(init=False) + _available_resources: Resources = field(default_factory=Resources.create_as_empty) def __post_init__(self) -> None: object.__setattr__(self, "_available_resources", self.ec2_instance.resources) From 939636c18a8057f5dc0cf13f4454cad1ba9a8b21 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 12:18:48 +0000 Subject: [PATCH 44/86] refactor --- .../src/simcore_service_autoscaling/models.py | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 664c807b1fe..aa6745b7e8f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -35,8 +35,7 @@ def assign_task(self, task, task_resources: Resources) -> None: @dataclass(frozen=True, kw_only=True, slots=True) -class AssociatedInstance: - node: Node +class _BaseInstance: ec2_instance: EC2InstanceData assigned_tasks: 
list = field(default_factory=list) _available_resources: Resources = field(default_factory=Resources.create_as_empty) @@ -59,48 +58,35 @@ def available_resources(self) -> Resources: @dataclass(frozen=True, kw_only=True, slots=True) -class NonAssociatedInstance: - ec2_instance: EC2InstanceData - assigned_tasks: list = field(default_factory=list) - _available_resources: Resources = field(default_factory=Resources.create_as_empty) - - def __post_init__(self) -> None: - object.__setattr__(self, "_available_resources", self.ec2_instance.resources) - - def has_resources_for_task(self, task_resources: Resources) -> bool: - return bool(self._available_resources >= task_resources) +class AssociatedInstance(_BaseInstance): + node: Node - def assign_task(self, task, task_resources: Resources) -> None: - self.assigned_tasks.append(task) - object.__setattr__( - self, "_available_resources", self._available_resources - task_resources - ) - @property - def available_resources(self) -> Resources: - return self._available_resources +@dataclass(frozen=True, kw_only=True, slots=True) +class NonAssociatedInstance(_BaseInstance): + ... 
@dataclass(frozen=True, kw_only=True, slots=True) class Cluster: active_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is active and ready to receive tasks (or with running tasks)" + "description": "This is a EC2-backed docker node which is active and ready to receive tasks (or with running tasks)" } ) pending_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is active and NOT yet ready to receive tasks" + "description": "This is a EC2-backed docker node which is active and NOT yet ready to receive tasks" } ) drained_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is drained (cannot accept tasks)" + "description": "This is a EC2-backed docker node which is drained (cannot accept tasks)" } ) reserve_drained_nodes: list[AssociatedInstance] = field( metadata={ - "description": "This is a EC2 backed docker node which is drained in the reserve if this is enabled (with no tasks)" + "description": "This is a EC2-backed docker node which is drained in the reserve if this is enabled (with no tasks)" } ) pending_ec2s: list[NonAssociatedInstance] = field( From 734138486adbb5d6c95311c7ed29617585eee44b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 12:19:08 +0000 Subject: [PATCH 45/86] analyze with used resources --- .../modules/auto_scaling_core.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 42137730db8..30e19e510aa 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -84,7 +84,16 @@ def 
_node_not_ready(node: Node) -> bool: active_nodes, pending_nodes, all_drained_nodes = [], [], [] for instance in attached_ec2s: if await auto_scaling_mode.is_instance_active(app, instance): - active_nodes.append(instance) + node_used_resources = await auto_scaling_mode.compute_node_used_resources( + app, instance + ) + active_nodes.append( + dataclasses.replace( + instance, + _available_resources=instance.ec2_instance.resources + - node_used_resources, + ) + ) elif auto_scaling_mode.is_instance_drained(instance): all_drained_nodes.append(instance) else: From 5dc9183c87e5778cb35e132629c3bd179a5025ce Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:14:51 +0000 Subject: [PATCH 46/86] in memory tasks only take memory --- .../modules/auto_scaling_mode_computational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index e2452f78203..84d08cc48ce 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -140,7 +140,7 @@ async def compute_node_used_resources( ) if num_results_in_memory > 0: # NOTE: this is a trick to consider the node still useful - return Resources(cpus=1, ram=ByteSize()) + return Resources(cpus=0, ram=ByteSize(1024 * 1024 * 1024)) return await dask.get_worker_used_resources( _scheduler_url(app), instance.ec2_instance ) From edaf0ccdfd7b73f90bb18d5b1297d2da5cf7b0c9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:16:58 +0000 Subject: [PATCH 47/86] refactor --- .../modules/auto_scaling_core.py | 248 ++++++++++-------- 1 file changed, 140 insertions(+), 108 deletions(-) 
diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 30e19e510aa..273cd49ff3a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -222,72 +222,44 @@ async def _activate_and_notify( app: FastAPI, auto_scaling_mode: BaseAutoscaling, drained_node: AssociatedInstance, - tasks: list, -) -> list: +) -> None: await asyncio.gather( utils_docker.set_node_availability( get_docker_client(app), drained_node.node, available=True ), auto_scaling_mode.log_message_from_tasks( app, - tasks, + drained_node.assigned_tasks, "cluster adjusted, service should start shortly...", level=logging.INFO, ), - auto_scaling_mode.progress_message_from_tasks(app, tasks, progress=1.0), + auto_scaling_mode.progress_message_from_tasks( + app, drained_node.assigned_tasks, progress=1.0 + ), ) - return tasks async def _activate_drained_nodes( app: FastAPI, cluster: Cluster, - pending_tasks: list, auto_scaling_mode: BaseAutoscaling, -) -> tuple[list, Cluster]: - """returns the tasks that were assigned to the drained nodes""" - if not pending_tasks: - # nothing to do - return [], cluster - - activatable_instances: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=node.ec2_instance, - available_resources=node.ec2_instance.resources, - assigned_tasks=[], - ) +) -> Cluster: + nodes_to_activate = [ + node for node in itertools.chain( cluster.drained_nodes, cluster.reserve_drained_nodes ) - ] - - still_pending_tasks = [ - task - for task in pending_tasks - if not await auto_scaling_mode.try_assigning_task_to_instances( - app, task, activatable_instances, notify_progress=False - ) - ] - - nodes_to_activate = [ - (node, assigned_tasks_to_instance.assigned_tasks) - for assigned_tasks_to_instance, node in zip( - 
activatable_instances, - itertools.chain(cluster.drained_nodes, cluster.reserve_drained_nodes), - strict=True, - ) - if assigned_tasks_to_instance.assigned_tasks + if node.assigned_tasks ] # activate these nodes now await asyncio.gather( *( - _activate_and_notify(app, auto_scaling_mode, node, tasks) - for node, tasks in nodes_to_activate + _activate_and_notify(app, auto_scaling_mode, node) + for node in nodes_to_activate ) ) - new_active_nodes = [node for node, _ in nodes_to_activate] - new_active_node_ids = {node.ec2_instance.id for node in new_active_nodes} + new_active_node_ids = {node.ec2_instance.id for node in nodes_to_activate} remaining_drained_nodes = [ node for node in cluster.drained_nodes @@ -298,9 +270,9 @@ async def _activate_drained_nodes( for node in cluster.reserve_drained_nodes if node.ec2_instance.id not in new_active_node_ids ] - return still_pending_tasks, dataclasses.replace( + return dataclasses.replace( cluster, - active_nodes=cluster.active_nodes + new_active_nodes, + active_nodes=cluster.active_nodes + nodes_to_activate, drained_nodes=remaining_drained_nodes, reserve_drained_nodes=remaining_reserved_drained_nodes, ) @@ -356,78 +328,133 @@ async def _try_assign_task_to_instances( return False +def _try_assign_task_to_ec2_instance( + task, + *, + instances: list[AssociatedInstance] | list[NonAssociatedInstance], + task_required_ec2_instance: InstanceTypeType | None, + task_required_resources: Resources, +) -> bool: + for instance in instances: + if task_required_ec2_instance and ( + task_required_ec2_instance != instance.ec2_instance.type + ): + continue + if instance.has_resources_for_task(task_required_resources): + _logger.info( + "%s", + f"assigning task with {task_required_resources=}, {task_required_ec2_instance=} to " + f"{instance.ec2_instance.id=},{instance.ec2_instance.type}: {instance.available_resources}/{instance.ec2_instance.resources}", + ) + instance.assign_task(task, task_required_resources) + return True + return False + + 
+def _try_assign_task_to_ec2_instance_type( + task, + *, + instances: list[AssignedTasksToInstanceType], + task_required_ec2_instance: InstanceTypeType | None, + task_required_resources: Resources, +) -> bool: + for instance in instances: + if task_required_ec2_instance and ( + task_required_ec2_instance != instance.instance_type + ): + continue + if instance.has_resources_for_task(task_required_resources): + _logger.info( + "%s", + f"assigning task with {task_required_resources=}, {task_required_ec2_instance=} to " + f"{instance.instance_type}: {instance.available_resources}/{instance.instance_type.resources}", + ) + instance.assign_task(task, task_required_resources) + return True + return False + + async def _assign_tasks_to_current_cluster( - app: FastAPI, tasks: list, cluster: Cluster -) -> Cluster: - ... + app: FastAPI, + tasks: list, + cluster: Cluster, + auto_scaling_mode: BaseAutoscaling, +) -> tuple[list, Cluster]: + unnassigned_tasks = [] + for task in tasks: + task_required_resources = auto_scaling_mode.get_task_required_resources(task) + task_required_ec2_instance = await auto_scaling_mode.get_task_defined_instance( + app, task + ) + + all_drained_nodes = cluster.drained_nodes + cluster.reserve_drained_nodes + if _try_assign_task_to_ec2_instance( + task, + instances=cluster.active_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ): + _logger.info("assigned task to active nodes") + elif _try_assign_task_to_ec2_instance( + task, + instances=all_drained_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ): + _logger.info("assigned task to drained nodes") + elif _try_assign_task_to_ec2_instance( + task, + instances=cluster.pending_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ): + _logger.info("assigned task to pending nodes") + elif 
_try_assign_task_to_ec2_instance( + task, + instances=cluster.pending_ec2s, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ): + _logger.info("assigned task to pending ec2s") + else: + unnassigned_tasks.append(task) + _logger.info("assigned task to nothing") + return unnassigned_tasks, cluster async def _find_needed_instances( app: FastAPI, - pending_tasks: list, + unassigned_tasks: list, available_ec2_types: list[EC2InstanceType], cluster: Cluster, auto_scaling_mode: BaseAutoscaling, ) -> dict[EC2InstanceType, int]: # 1. check first the pending task needs - active_ec2s_to_tasks: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=i.ec2_instance, - assigned_tasks=[], - available_resources=i.ec2_instance.resources - - await auto_scaling_mode.compute_node_used_resources(app, i), - ) - for i in cluster.active_nodes - ] - # NOTE: we add pending nodes to pending ec2, since they are both unavailable for now - pending_ec2s_to_tasks: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=i, - assigned_tasks=[], - available_resources=i.resources, - ) - for i in [i.ec2_instance for i in cluster.pending_ec2s] - + [i.ec2_instance for i in cluster.pending_nodes] - ] - drained_ec2s_to_tasks: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=i.ec2_instance, - assigned_tasks=[], - available_resources=i.ec2_instance.resources, - ) - for i in cluster.drained_nodes - ] needed_new_instance_types_for_tasks: list[AssignedTasksToInstanceType] = [] with log_context(_logger, logging.DEBUG, msg="finding needed instances"): - for task in pending_tasks: - task_defined_ec2_type = await auto_scaling_mode.get_task_defined_instance( - app, task + for task in unassigned_tasks: + task_required_resources = auto_scaling_mode.get_task_required_resources( + task ) - _logger.debug( - "task %s %s", - task, - f"defines ec2 type as {task_defined_ec2_type}" - if task_defined_ec2_type - 
else "does NOT define ec2 type", + task_required_ec2_instance = ( + await auto_scaling_mode.get_task_defined_instance(app, task) ) - if await _try_assign_task_to_instances( - app, + + # first check if we can assign the task to one of the newly tobe created instances + if _try_assign_task_to_ec2_instance_type( task, - auto_scaling_mode, - task_defined_ec2_type, - active_ec2s_to_tasks, - pending_ec2s_to_tasks, - drained_ec2s_to_tasks, - needed_new_instance_types_for_tasks, + instances=needed_new_instance_types_for_tasks, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, ): continue # so we need to find what we can create now try: # check if exact instance type is needed first - if task_defined_ec2_type: + if task_required_ec2_instance: defined_ec2 = find_selected_instance_type_for_task( - task_defined_ec2_type, + task_required_ec2_instance, available_ec2_types, auto_scaling_mode, task, @@ -480,8 +507,8 @@ async def _find_needed_instances( ) > 0: # check if some are already pending remaining_pending_instances = [ - i.instance for i in pending_ec2s_to_tasks if not i.assigned_tasks - ] + i.ec2_instance for i in cluster.pending_ec2s if not i.assigned_tasks + ] + [i.ec2_instance for i in cluster.pending_nodes if not i.assigned_tasks] if len(remaining_pending_instances) < ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - len(cluster.reserve_drained_nodes) @@ -653,7 +680,7 @@ async def _start_instances( async def _scale_up_cluster( app: FastAPI, cluster: Cluster, - pending_tasks: list, + unassigned_tasks: list, auto_scaling_mode: BaseAutoscaling, ) -> Cluster: app_settings: ApplicationSettings = app.state.settings @@ -664,22 +691,24 @@ async def _scale_up_cluster( # let's start these if needed_ec2_instances := await _find_needed_instances( - app, pending_tasks, allowed_instance_types, cluster, auto_scaling_mode + app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode ): await 
auto_scaling_mode.log_message_from_tasks( app, - pending_tasks, + unassigned_tasks, "service is pending due to missing resources, scaling up cluster now...", level=logging.INFO, ) # NOTE: notify the up-scaling progress started... - await auto_scaling_mode.progress_message_from_tasks(app, pending_tasks, 0.001) + await auto_scaling_mode.progress_message_from_tasks( + app, unassigned_tasks, 0.001 + ) new_pending_instances = await _start_instances( - app, needed_ec2_instances, pending_tasks, auto_scaling_mode + app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode ) await auto_scaling_mode.log_message_from_tasks( app, - pending_tasks, + unassigned_tasks, f"{len(new_pending_instances)} new machines being started, please wait...", level=logging.INFO, ) @@ -827,15 +856,17 @@ async def _autoscale_cluster( # 1. check if we have pending tasks and resolve them by activating some drained nodes unrunnable_tasks = await auto_scaling_mode.list_unrunnable_tasks(app) _logger.info("found %s unrunnable tasks", len(unrunnable_tasks)) - # 2. try to activate drained nodes to cover some of the tasks - still_unrunnable_tasks, cluster = await _activate_drained_nodes( - app, cluster, unrunnable_tasks, auto_scaling_mode + + queued_or_missing_instance_tasks, cluster = await _assign_tasks_to_current_cluster( + app, unrunnable_tasks, cluster, auto_scaling_mode ) + # 2. 
try to activate drained nodes to cover some of the tasks + cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode) # let's check if there are still pending tasks or if the reserve was used app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - if still_unrunnable_tasks or ( + if queued_or_missing_instance_tasks or ( len(cluster.reserve_drained_nodes) < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): @@ -846,20 +877,20 @@ async def _autoscale_cluster( # ): _logger.info( "still %s unrunnable tasks after node activation, try to scale up...", - len(still_unrunnable_tasks), + len(queued_or_missing_instance_tasks), ) # yes? then scale up cluster = await _scale_up_cluster( - app, cluster, still_unrunnable_tasks, auto_scaling_mode + app, cluster, queued_or_missing_instance_tasks, auto_scaling_mode ) # give feedback on machine creation elif ( - len(still_unrunnable_tasks) == len(unrunnable_tasks) == 0 + len(queued_or_missing_instance_tasks) == len(unrunnable_tasks) == 0 and cluster.can_scale_down() ): _logger.info( "there is %s waiting task, try to scale down...", - len(still_unrunnable_tasks), + len(queued_or_missing_instance_tasks), ) # NOTE: we only scale down in case we did not just scale up. 
The swarm needs some time to adjust await auto_scaling_mode.try_retire_nodes(app) @@ -906,6 +937,7 @@ async def auto_scale_cluster( cluster = await _analyze_current_cluster(app, auto_scaling_mode) cluster = await _cleanup_disconnected_nodes(app, cluster) cluster = await _try_attach_pending_ec2s(app, cluster, auto_scaling_mode) + cluster = await _autoscale_cluster(app, cluster, auto_scaling_mode) await _notify_autoscaling_status(app, cluster, auto_scaling_mode) From ab66063c5bc38091ab1243d0755fade5b9643d02 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:38:44 +0000 Subject: [PATCH 48/86] refactoring notifications --- .../modules/auto_scaling_core.py | 88 ++++------ .../modules/auto_scaling_mode_base.py | 25 +-- .../auto_scaling_mode_computational.py | 33 +--- .../modules/auto_scaling_mode_dynamic.py | 31 +--- .../utils/auto_scaling_core.py | 68 +------- .../utils/computational_scaling.py | 61 +------ .../utils/dynamic_scaling.py | 15 +- .../unit/test_utils_computational_scaling.py | 151 +----------------- 8 files changed, 37 insertions(+), 435 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 273cd49ff3a..bec8ea2c88e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -19,6 +19,7 @@ from fastapi.encoders import jsonable_encoder from models_library.generated_models.docker_rest_api import Node, NodeState from servicelib.logging_utils import log_catch, log_context +from servicelib.utils_formatting import timedelta_as_minute_second from types_aiobotocore_ec2.literals import InstanceTypeType from ..core.errors import ( @@ -30,7 +31,6 @@ ) from ..core.settings import ApplicationSettings, get_application_settings from ..models 
import ( - AssignedTasksToInstance, AssignedTasksToInstanceType, AssociatedInstance, Cluster, @@ -40,7 +40,6 @@ from ..utils.auto_scaling_core import ( associate_ec2_instances_with_nodes, ec2_startup_script, - filter_by_task_defined_instance, find_selected_instance_type_for_task, node_host_name_from_ec2_private_dns, ) @@ -278,56 +277,6 @@ async def _activate_drained_nodes( ) -async def _try_assign_task_to_instances( - app: FastAPI, - task, - auto_scaling_mode: BaseAutoscaling, - task_defined_ec2_type: InstanceTypeType | None, - active_instances_to_tasks: list[AssignedTasksToInstance], - pending_instances_to_tasks: list[AssignedTasksToInstance], - drained_instances_to_tasks: list[AssignedTasksToInstance], - needed_new_instance_types_for_tasks: list[AssignedTasksToInstanceType], -) -> bool: - ( - filtered_active_instance_to_task, - filtered_pending_instance_to_task, - filtered_drained_instances_to_task, - filtered_needed_new_instance_types_to_task, - ) = filter_by_task_defined_instance( - task_defined_ec2_type, - active_instances_to_tasks, - pending_instances_to_tasks, - drained_instances_to_tasks, - needed_new_instance_types_for_tasks, - ) - # try to assign the task to one of the active, pending or newly created instances - if ( - await auto_scaling_mode.try_assigning_task_to_instances( - app, - task, - filtered_active_instance_to_task, - notify_progress=False, - ) - or await auto_scaling_mode.try_assigning_task_to_instances( - app, - task, - filtered_pending_instance_to_task, - notify_progress=True, - ) - or await auto_scaling_mode.try_assigning_task_to_instances( - app, - task, - filtered_drained_instances_to_task, - notify_progress=False, - ) - or auto_scaling_mode.try_assigning_task_to_instance_types( - task, filtered_needed_new_instance_types_to_task - ) - ): - return True - return False - - def _try_assign_task_to_ec2_instance( task, *, @@ -850,6 +799,29 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: # 4. 
+async def _notify_machine_creation_progress(app: FastAPI, cluster: Cluster) -> None: + app_settings = get_application_settings(app) + assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec + instance_max_time_to_start = ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME + ) + now = datetime.datetime.now(datetime.timezone.utc) + for instance in cluster.pending_ec2s: + time_since_launch = now - instance.ec2_instance.launch_time + estimated_time_to_completion = ( + instance.ec2_instance.launch_time + instance_max_time_to_start - now + ) + _logger.info( + "LOG: %s", + f"adding machines to the cluster (time waiting: {timedelta_as_minute_second(time_since_launch)}," + f" est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait...", + ) + _logger.info( + "PROGRESS: %s", + f"{time_since_launch.total_seconds() / instance_max_time_to_start.total_seconds():.2f}", + ) + + async def _autoscale_cluster( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling ) -> Cluster: @@ -870,26 +842,20 @@ async def _autoscale_cluster( len(cluster.reserve_drained_nodes) < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): - # we might want to scale up if we do not already have reached the maximum amount of machines - # if ( - # cluster.total_number_of_machines() - # < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES - # ): _logger.info( "still %s unrunnable tasks after node activation, try to scale up...", len(queued_or_missing_instance_tasks), ) - # yes? 
then scale up cluster = await _scale_up_cluster( app, cluster, queued_or_missing_instance_tasks, auto_scaling_mode ) - # give feedback on machine creation + elif ( len(queued_or_missing_instance_tasks) == len(unrunnable_tasks) == 0 and cluster.can_scale_down() ): _logger.info( - "there is %s waiting task, try to scale down...", + "there is %s waiting task, slowly and gracefully scaling down...", len(queued_or_missing_instance_tasks), ) # NOTE: we only scale down in case we did not just scale up. The swarm needs some time to adjust @@ -939,5 +905,5 @@ async def auto_scale_cluster( cluster = await _try_attach_pending_ec2s(app, cluster, auto_scaling_mode) cluster = await _autoscale_cluster(app, cluster, auto_scaling_mode) - + await _notify_machine_creation_progress(app, cluster) await _notify_autoscaling_status(app, cluster, auto_scaling_mode) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py index d967b1a47f4..070dad548f9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_base.py @@ -9,11 +9,7 @@ from servicelib.logging_utils import LogLevelInt from types_aiobotocore_ec2.literals import InstanceTypeType -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, -) +from ..models import AssociatedInstance from ..utils import utils_docker @@ -41,25 +37,6 @@ def get_new_node_docker_tags( async def list_unrunnable_tasks(app: FastAPI) -> list: ... - @staticmethod - @abstractmethod - async def try_assigning_task_to_instances( - app: FastAPI, - pending_task, - instances_to_tasks: list[AssignedTasksToInstance], - *, - notify_progress: bool - ) -> bool: - ... 
- - @staticmethod - @abstractmethod - def try_assigning_task_to_instance_types( - pending_task, - instance_types_to_tasks: list[AssignedTasksToInstanceType], - ) -> bool: - ... - @staticmethod @abstractmethod async def log_message_from_tasks( diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 84d08cc48ce..433139e343c 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -20,12 +20,7 @@ DaskWorkerNotFoundError, ) from ..core.settings import get_application_settings -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, - DaskTask, -) +from ..models import AssociatedInstance, DaskTask from ..utils import computational_scaling as utils from ..utils import utils_docker, utils_ec2 from . 
import dask @@ -71,7 +66,7 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: queued_tasks = [] for tasks in processing_tasks_by_worker.values(): queued_tasks += tasks[1:] - _logger.info( + _logger.debug( "found %s unrunnable tasks and %s potentially queued tasks", len(unrunnable_tasks), len(queued_tasks), @@ -83,30 +78,6 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: ) return [] - @staticmethod - async def try_assigning_task_to_instances( - app: FastAPI, - pending_task, - instances_to_tasks: list[AssignedTasksToInstance], - *, - notify_progress: bool, - ) -> bool: - return await utils.try_assigning_task_to_instances( - app, - pending_task, - instances_to_tasks, - notify_progress=notify_progress, - ) - - @staticmethod - def try_assigning_task_to_instance_types( - pending_task, - instance_types_to_tasks: list[AssignedTasksToInstanceType], - ) -> bool: - return utils.try_assigning_task_to_instance_types( - pending_task, instance_types_to_tasks - ) - @staticmethod async def log_message_from_tasks( app: FastAPI, tasks: list, message: str, *, level: LogLevelInt diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py index e3205dde911..76e808b1b60 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_dynamic.py @@ -6,12 +6,7 @@ from types_aiobotocore_ec2.literals import InstanceTypeType from ..core.settings import get_application_settings -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, -) -from ..utils import dynamic_scaling as utils +from ..models import AssociatedInstance from ..utils import utils_docker, utils_ec2 from ..utils.rabbitmq import log_tasks_message, progress_tasks_message from 
.auto_scaling_mode_base import BaseAutoscaling @@ -49,30 +44,6 @@ async def list_unrunnable_tasks(app: FastAPI) -> list[Task]: service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS, ) - @staticmethod - async def try_assigning_task_to_instances( - app: FastAPI, - pending_task, - instances_to_tasks: list[AssignedTasksToInstance], - *, - notify_progress: bool - ) -> bool: - return await utils.try_assigning_task_to_instances( - app, - pending_task, - instances_to_tasks, - notify_progress=notify_progress, - ) - - @staticmethod - def try_assigning_task_to_instance_types( - pending_task, - instance_types_to_tasks: list[AssignedTasksToInstanceType], - ) -> bool: - return utils.try_assigning_task_to_instance_types( - pending_task, instance_types_to_tasks - ) - @staticmethod async def log_message_from_tasks( app: FastAPI, tasks: list, message: str, *, level: LogLevelInt diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py index 60e18ad58a6..0cfd72d12f5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/auto_scaling_core.py @@ -13,11 +13,7 @@ from ..core.errors import Ec2InstanceInvalidError, Ec2InvalidDnsNameError from ..core.settings import ApplicationSettings -from ..models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, -) +from ..models import AssociatedInstance from ..modules.auto_scaling_mode_base import BaseAutoscaling from . 
import utils_docker @@ -116,68 +112,6 @@ def _instance_type_by_type_name( return bool(ec2_type.name == type_name) -def _instance_type_map_by_type_name( - mapping: AssignedTasksToInstanceType, *, type_name: InstanceTypeType | None -) -> bool: - return _instance_type_by_type_name(mapping.instance_type, type_name=type_name) - - -def _instance_data_map_by_type_name( - mapping: AssignedTasksToInstance, *, type_name: InstanceTypeType | None -) -> bool: - if type_name is None: - return True - return bool(mapping.instance.type == type_name) - - -def filter_by_task_defined_instance( - instance_type_name: InstanceTypeType | None, - active_instances_to_tasks: list[AssignedTasksToInstance], - pending_instances_to_tasks: list[AssignedTasksToInstance], - drained_instances_to_tasks: list[AssignedTasksToInstance], - needed_new_instance_types_for_tasks: list[AssignedTasksToInstanceType], -) -> tuple[ - list[AssignedTasksToInstance], - list[AssignedTasksToInstance], - list[AssignedTasksToInstance], - list[AssignedTasksToInstanceType], -]: - return ( - list( - filter( - functools.partial( - _instance_data_map_by_type_name, type_name=instance_type_name - ), - active_instances_to_tasks, - ) - ), - list( - filter( - functools.partial( - _instance_data_map_by_type_name, type_name=instance_type_name - ), - pending_instances_to_tasks, - ) - ), - list( - filter( - functools.partial( - _instance_data_map_by_type_name, type_name=instance_type_name - ), - drained_instances_to_tasks, - ) - ), - list( - filter( - functools.partial( - _instance_type_map_by_type_name, type_name=instance_type_name - ), - needed_new_instance_types_for_tasks, - ) - ), - ) - - def find_selected_instance_type_for_task( instance_type_name: InstanceTypeType, available_ec2_types: list[EC2InstanceType], diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index 89645a1d6d6..6f61f69a2f2 100644 
--- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -1,4 +1,3 @@ -import datetime import logging from typing import Final @@ -6,11 +5,8 @@ from dask_task_models_library.resource_constraints import ( get_ec2_instance_type_from_resources, ) -from fastapi import FastAPI -from servicelib.utils_formatting import timedelta_as_minute_second -from ..core.settings import get_application_settings -from ..models import AssignedTasksToInstance, AssignedTasksToInstanceType, DaskTask +from ..models import DaskTask _logger = logging.getLogger(__name__) @@ -37,58 +33,3 @@ def _compute_tasks_resources(tasks: list[DaskTask]) -> Resources: (resources_from_dask_task(t) for t in tasks), Resources.create_as_empty(), ) - - -async def try_assigning_task_to_instances( - app: FastAPI, - task: DaskTask, - instances_to_tasks: list[AssignedTasksToInstance], - *, - notify_progress: bool, -) -> bool: - app_settings = get_application_settings(app) - assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - instance_max_time_to_start = ( - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME - ) - task_required_resources = resources_from_dask_task(task) - for assigned_tasks_to_instance in instances_to_tasks: - if assigned_tasks_to_instance.has_resources_for_task(task_required_resources): - assigned_tasks_to_instance.assign_task(task, task_required_resources) - - if notify_progress: - now = datetime.datetime.now(datetime.timezone.utc) - time_since_launch = ( - now - assigned_tasks_to_instance.instance.launch_time - ) - estimated_time_to_completion = ( - assigned_tasks_to_instance.instance.launch_time - + instance_max_time_to_start - - now - ) - _logger.info( - "LOG: %s", - f"adding machines to the cluster (time waiting: {timedelta_as_minute_second(time_since_launch)}," - f" est. 
remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait...", - ) - _logger.info( - "PROGRESS: %s", - time_since_launch.total_seconds() - / instance_max_time_to_start.total_seconds(), - ) - return True - return False - - -def try_assigning_task_to_instance_types( - task: DaskTask, - instance_types_to_tasks: list[AssignedTasksToInstanceType], -) -> bool: - task_required_resources = resources_from_dask_task(task) - for assigned_tasks_to_instance_type in instance_types_to_tasks: - if assigned_tasks_to_instance_type.has_resources_for_task( - task_required_resources - ): - assigned_tasks_to_instance_type.assign_task(task, task_required_resources) - return True - return False diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py index ce7fce4297a..a5f1c56fea3 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py @@ -6,25 +6,14 @@ from servicelib.utils_formatting import timedelta_as_minute_second from ..core.settings import get_application_settings -from ..models import AssignedTasksToInstance, AssignedTasksToInstanceType +from ..models import AssignedTasksToInstance from . import utils_docker from .rabbitmq import log_tasks_message, progress_tasks_message logger = logging.getLogger(__name__) -def try_assigning_task_to_instance_types( - task: Task, - instance_types_to_tasks: list[AssignedTasksToInstanceType], -) -> bool: - task_required_resources = utils_docker.get_max_resources_from_docker_task(task) - for assigned_tasks_to_instance_type in instance_types_to_tasks: - if assigned_tasks_to_instance_type.has_resources_for_task( - task_required_resources - ): - assigned_tasks_to_instance_type.assign_task(task, task_required_resources) - return True - return False +# NOTE: DEPRECATED module!!! 
kept until the logging is moved to auto_scaling_core.py async def try_assigning_task_to_instances( diff --git a/services/autoscaling/tests/unit/test_utils_computational_scaling.py b/services/autoscaling/tests/unit/test_utils_computational_scaling.py index 750f43e2b03..7341c9d470e 100644 --- a/services/autoscaling/tests/unit/test_utils_computational_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_computational_scaling.py @@ -3,30 +3,15 @@ # pylint: disable=unused-variable # pylint: disable=too-many-arguments -import datetime -from collections.abc import Callable -from unittest import mock import pytest -from aws_library.ec2.models import EC2InstanceType, Resources -from faker import Faker -from models_library.generated_models.docker_rest_api import Node as DockerNode +from aws_library.ec2.models import Resources from pydantic import ByteSize, parse_obj_as -from pytest_mock import MockerFixture -from simcore_service_autoscaling.models import ( - AssignedTasksToInstance, - AssignedTasksToInstanceType, - AssociatedInstance, - DaskTask, - DaskTaskResources, - EC2InstanceData, -) +from simcore_service_autoscaling.models import DaskTask, DaskTaskResources from simcore_service_autoscaling.utils.computational_scaling import ( _DEFAULT_MAX_CPU, _DEFAULT_MAX_RAM, resources_from_dask_task, - try_assigning_task_to_instance_types, - try_assigning_task_to_instances, ) @@ -65,135 +50,3 @@ ) def test_resources_from_dask_task(dask_task: DaskTask, expected_resource: Resources): assert resources_from_dask_task(dask_task) == expected_resource - - -@pytest.fixture -def fake_app(mocker: MockerFixture) -> mock.Mock: - app = mocker.Mock() - app.state.settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME = ( - datetime.timedelta(minutes=1) - ) - return app - - -@pytest.fixture -def fake_task(faker: Faker) -> Callable[..., DaskTask]: - def _creator(**overrides) -> DaskTask: - return DaskTask( - **( - { - "task_id": faker.uuid4(), - "required_resources": 
DaskTaskResources(faker.pydict()), - } - | overrides - ) - ) - - return _creator - - -@pytest.fixture -def fake_associated_host_instance( - host_node: DockerNode, - fake_ec2_instance_data: Callable[..., EC2InstanceData], -) -> AssociatedInstance: - return AssociatedInstance( - host_node, - fake_ec2_instance_data(), - ) - - -async def test_try_assigning_task_to_instances_with_no_instances( - fake_app: mock.Mock, - fake_task: Callable[..., DaskTask], -): - task = fake_task() - assert ( - await try_assigning_task_to_instances(fake_app, task, [], notify_progress=True) - is False - ) - - -async def test_try_assigning_task_to_instances( - fake_app: mock.Mock, - fake_task: Callable[..., DaskTask], - fake_ec2_instance_data: Callable[..., EC2InstanceData], -): - task = fake_task(required_resources={"CPU": 2}) - ec2_instance = fake_ec2_instance_data() - pending_instance_to_tasks: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=ec2_instance, - assigned_tasks=[], - available_resources=Resources(cpus=4, ram=ByteSize(1024**2)), - ) - ] - - # calling once should allow to add that task to the instance - assert ( - await try_assigning_task_to_instances( - fake_app, - task, - pending_instance_to_tasks, - notify_progress=True, - ) - is True - ) - assert pending_instance_to_tasks[0].assigned_tasks == [task] - # calling a second time as well should allow to add that task to the instance - assert ( - await try_assigning_task_to_instances( - fake_app, - task, - pending_instance_to_tasks, - notify_progress=True, - ) - is True - ) - assert pending_instance_to_tasks[0].assigned_tasks == [task, task] - # calling a third time should fail - assert ( - await try_assigning_task_to_instances( - fake_app, - task, - pending_instance_to_tasks, - notify_progress=True, - ) - is False - ) - assert pending_instance_to_tasks[0].assigned_tasks == [task, task] - - -def test_try_assigning_task_to_instance_types_with_empty_types( - fake_task: Callable[..., DaskTask] -): - task = 
fake_task(required_resources={"CPU": 2}) - assert try_assigning_task_to_instance_types(task, []) is False - - -def test_try_assigning_task_to_instance_types( - fake_task: Callable[..., DaskTask], faker: Faker -): - task = fake_task(required_resources={"CPU": 2}) - # create an instance type with some CPUs - fake_instance_type = EC2InstanceType( - name=faker.name(), - resources=Resources(cpus=6, ram=parse_obj_as(ByteSize, "2GiB")), - ) - instance_type_to_tasks: list[AssignedTasksToInstanceType] = [ - AssignedTasksToInstanceType( - instance_type=fake_instance_type, - assigned_tasks=[], - available_resources=fake_instance_type.resources, - ) - ] - # now this should work 3 times - assert try_assigning_task_to_instance_types(task, instance_type_to_tasks) is True - assert instance_type_to_tasks[0].assigned_tasks == [task] - assert try_assigning_task_to_instance_types(task, instance_type_to_tasks) is True - assert instance_type_to_tasks[0].assigned_tasks == [task, task] - assert try_assigning_task_to_instance_types(task, instance_type_to_tasks) is True - assert instance_type_to_tasks[0].assigned_tasks == [task, task, task] - # now it should fail - assert try_assigning_task_to_instance_types(task, instance_type_to_tasks) is False - assert instance_type_to_tasks[0].assigned_tasks == [task, task, task] From 69b6e8c80077103045cb6a5073fa91d9f7102bbf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:42:51 +0000 Subject: [PATCH 49/86] cleanup --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index bec8ea2c88e..0fce51ef624 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -843,7 +843,7 @@ async def _autoscale_cluster( < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): _logger.info( - "still %s unrunnable tasks after node activation, try to scale up...", + "%s unrunnable tasks could not be assigned to drained nodes, slowly trying to scale up...", len(queued_or_missing_instance_tasks), ) cluster = await _scale_up_cluster( From 8aaa247ccb6630de1e4a92a08e12d0c60afb7e6c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:56:41 +0000 Subject: [PATCH 50/86] refactor --- .../modules/auto_scaling_core.py | 85 ++++++++++--------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 0fce51ef624..3493772538e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -329,7 +329,7 @@ async def _assign_tasks_to_current_cluster( cluster: Cluster, auto_scaling_mode: BaseAutoscaling, ) -> tuple[list, Cluster]: - unnassigned_tasks = [] + unassigned_tasks = [] for task in tasks: task_required_resources = auto_scaling_mode.get_task_required_resources(task) task_required_ec2_instance = await auto_scaling_mode.get_task_defined_instance( @@ -337,38 +337,43 @@ async def _assign_tasks_to_current_cluster( ) all_drained_nodes = cluster.drained_nodes + cluster.reserve_drained_nodes - if _try_assign_task_to_ec2_instance( - task, - instances=cluster.active_nodes, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ): - _logger.info("assigned task to active nodes") - elif _try_assign_task_to_ec2_instance( - task, - 
instances=all_drained_nodes, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ): - _logger.info("assigned task to drained nodes") - elif _try_assign_task_to_ec2_instance( - task, - instances=cluster.pending_nodes, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ): - _logger.info("assigned task to pending nodes") - elif _try_assign_task_to_ec2_instance( - task, - instances=cluster.pending_ec2s, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, + if ( + _try_assign_task_to_ec2_instance( + task, + instances=cluster.active_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ) + or _try_assign_task_to_ec2_instance( + task, + instances=all_drained_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ) + or _try_assign_task_to_ec2_instance( + task, + instances=cluster.pending_nodes, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ) + or _try_assign_task_to_ec2_instance( + task, + instances=cluster.pending_ec2s, + task_required_ec2_instance=task_required_ec2_instance, + task_required_resources=task_required_resources, + ) ): - _logger.info("assigned task to pending ec2s") + _logger.info("assigned task to cluster") else: - unnassigned_tasks.append(task) - _logger.info("assigned task to nothing") - return unnassigned_tasks, cluster + unassigned_tasks.append(task) + + if unassigned_tasks: + _logger.info( + "the current cluster should cope with %s tasks, %s are unnassigned/queued tasks", + len(tasks) - len(unassigned_tasks), + len(unassigned_tasks), + ) + return unassigned_tasks, cluster async def _find_needed_instances( @@ -842,13 +847,17 @@ async def _autoscale_cluster( len(cluster.reserve_drained_nodes) < 
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ): - _logger.info( - "%s unrunnable tasks could not be assigned to drained nodes, slowly trying to scale up...", - len(queued_or_missing_instance_tasks), - ) - cluster = await _scale_up_cluster( - app, cluster, queued_or_missing_instance_tasks, auto_scaling_mode - ) + if ( + cluster.total_number_of_machines() + < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES + ): + _logger.info( + "%s unrunnable tasks could not be assigned, slowly trying to scale up...", + len(queued_or_missing_instance_tasks), + ) + cluster = await _scale_up_cluster( + app, cluster, queued_or_missing_instance_tasks, auto_scaling_mode + ) elif ( len(queued_or_missing_instance_tasks) == len(unrunnable_tasks) == 0 From e0794c5394ea654012a452ac493599eda68c9094 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 18:18:46 +0100 Subject: [PATCH 51/86] fix clusters-keeper --- .../api_schemas_clusters_keeper/ec2_instances.py | 4 ++-- .../simcore_service_clusters_keeper/rpc/ec2_instances.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/models-library/src/models_library/api_schemas_clusters_keeper/ec2_instances.py b/packages/models-library/src/models_library/api_schemas_clusters_keeper/ec2_instances.py index 16f47d2a3fd..057c02e1815 100644 --- a/packages/models-library/src/models_library/api_schemas_clusters_keeper/ec2_instances.py +++ b/packages/models-library/src/models_library/api_schemas_clusters_keeper/ec2_instances.py @@ -1,10 +1,10 @@ from dataclasses import dataclass -from pydantic import ByteSize, PositiveInt +from pydantic import ByteSize, NonNegativeFloat @dataclass(frozen=True) class EC2InstanceTypeGet: name: str - cpus: PositiveInt + cpus: NonNegativeFloat ram: ByteSize diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/ec2_instances.py 
b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/ec2_instances.py index a68760ad762..ab09c0a128e 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/ec2_instances.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/ec2_instances.py @@ -1,5 +1,3 @@ -from dataclasses import asdict - from aws_library.ec2.models import EC2InstanceType from fastapi import FastAPI from models_library.api_schemas_clusters_keeper.ec2_instances import EC2InstanceTypeGet @@ -17,4 +15,7 @@ async def get_instance_type_details( instance_capabilities: list[EC2InstanceType] = await get_ec2_client( app ).get_ec2_instance_capabilities(instance_type_names) - return [EC2InstanceTypeGet(**asdict(t)) for t in instance_capabilities] + return [ + EC2InstanceTypeGet(name=t.name, cpus=t.resources.cpus, ram=t.resources.ram) + for t in instance_capabilities + ] From f839aed5a6e16ddacd97e75629c710552f5a8a5b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 18:29:44 +0100 Subject: [PATCH 52/86] refactoring --- .../modules/auto_scaling_core.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 3493772538e..a9daa9edc98 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -336,32 +336,36 @@ async def _assign_tasks_to_current_cluster( app, task ) - all_drained_nodes = cluster.drained_nodes + cluster.reserve_drained_nodes - if ( - _try_assign_task_to_ec2_instance( + assignment_functions = [ + lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance( task, instances=cluster.active_nodes, - 
task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ) - or _try_assign_task_to_ec2_instance( + task_required_ec2_instance=required_ec2, + task_required_resources=required_resources, + ), + lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance( task, - instances=all_drained_nodes, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ) - or _try_assign_task_to_ec2_instance( + instances=cluster.drained_nodes + cluster.reserve_drained_nodes, + task_required_ec2_instance=required_ec2, + task_required_resources=required_resources, + ), + lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance( task, instances=cluster.pending_nodes, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ) - or _try_assign_task_to_ec2_instance( + task_required_ec2_instance=required_ec2, + task_required_resources=required_resources, + ), + lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance( task, instances=cluster.pending_ec2s, - task_required_ec2_instance=task_required_ec2_instance, - task_required_resources=task_required_resources, - ) + task_required_ec2_instance=required_ec2, + task_required_resources=required_resources, + ), + ] + + if any( + assignment(task, task_required_ec2_instance, task_required_resources) + for assignment in assignment_functions ): _logger.info("assigned task to cluster") else: From 71ba6b0afd96b7e3f5a44b2c57ac23a97c53c2c1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 18:31:27 +0100 Subject: [PATCH 53/86] minor --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index a9daa9edc98..be47e444306 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -804,8 +804,6 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: terminated_instances=cluster.terminated_instances + [i.ec2_instance for i in terminateable_instances], ) - # 3. we could ask on rabbit whether someone would like to keep that machine for something (like the agent for example), if that is the case, we wait another hour and ask again? - # 4. async def _notify_machine_creation_progress(app: FastAPI, cluster: Cluster) -> None: From 8ba745734d1f3a0652540e6c099b9673693f0b56 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 15 Jan 2024 18:43:50 +0100 Subject: [PATCH 54/86] progress and logs now better --- .../modules/auto_scaling_core.py | 26 +++++++++++++------ .../auto_scaling_mode_computational.py | 2 +- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index be47e444306..63b7147ee9f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -806,26 +806,36 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: ) -async def _notify_machine_creation_progress(app: FastAPI, cluster: Cluster) -> None: +async def _notify_machine_creation_progress( + app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling +) -> None: app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec instance_max_time_to_start = ( 
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) + time_since_launch_to_tasks = collections.defaultdict(list) now = datetime.datetime.now(datetime.timezone.utc) - for instance in cluster.pending_ec2s: + for instance in cluster.pending_nodes + cluster.pending_ec2s: time_since_launch = now - instance.ec2_instance.launch_time estimated_time_to_completion = ( instance.ec2_instance.launch_time + instance_max_time_to_start - now ) - _logger.info( - "LOG: %s", + time_since_launch_to_tasks[time_since_launch] += instance.assigned_tasks + + for time_since_launch, tasks in time_since_launch_to_tasks.items(): + msg = ( f"adding machines to the cluster (time waiting: {timedelta_as_minute_second(time_since_launch)}," - f" est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait...", + f" est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait..." ) - _logger.info( - "PROGRESS: %s", - f"{time_since_launch.total_seconds() / instance_max_time_to_start.total_seconds():.2f}", + await auto_scaling_mode.log_message_from_tasks( + app, tasks, message=msg, level=logging.INFO + ) + await auto_scaling_mode.progress_message_from_tasks( + app, + tasks, + progress=time_since_launch.total_seconds() + / instance_max_time_to_start.total_seconds(), ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 433139e343c..f4ff24a5c08 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -90,7 +90,7 @@ async def log_message_from_tasks( async def progress_message_from_tasks(app: FastAPI, tasks: list, progress: float): assert app # nosec assert tasks # nosec - _logger.info("PROGRESS: %s", 
progress) + _logger.info("PROGRESS: %s", f"{progress:.2f}") @staticmethod def get_task_required_resources(task) -> Resources: From ec899dc276385f97cca2df956a6e37dd9effb15e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 09:17:00 +0100 Subject: [PATCH 55/86] improved notification mechanism --- .../modules/auto_scaling_core.py | 64 +++++++++++++------ .../auto_scaling_mode_computational.py | 4 +- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 63b7147ee9f..c988616bd3a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -806,37 +806,59 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster: ) -async def _notify_machine_creation_progress( - app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling +async def _notify_based_on_machine_type( + app: FastAPI, + instances: list[AssociatedInstance] | list[NonAssociatedInstance], + auto_scaling_mode: BaseAutoscaling, + *, + message: str, ) -> None: app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec instance_max_time_to_start = ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) - time_since_launch_to_tasks = collections.defaultdict(list) + launch_time_to_tasks = collections.defaultdict(list) now = datetime.datetime.now(datetime.timezone.utc) - for instance in cluster.pending_nodes + cluster.pending_ec2s: - time_since_launch = now - instance.ec2_instance.launch_time - estimated_time_to_completion = ( - instance.ec2_instance.launch_time + instance_max_time_to_start - now - ) - time_since_launch_to_tasks[time_since_launch] += instance.assigned_tasks + 
for instance in instances: + launch_time_to_tasks[ + instance.ec2_instance.launch_time + ] += instance.assigned_tasks - for time_since_launch, tasks in time_since_launch_to_tasks.items(): + for launch_time, tasks in launch_time_to_tasks.items(): + time_since_launch = now - launch_time + estimated_time_to_completion = launch_time + instance_max_time_to_start - now msg = ( - f"adding machines to the cluster (time waiting: {timedelta_as_minute_second(time_since_launch)}," + f"{message} (time waiting: {timedelta_as_minute_second(time_since_launch)}," f" est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait..." ) - await auto_scaling_mode.log_message_from_tasks( - app, tasks, message=msg, level=logging.INFO - ) - await auto_scaling_mode.progress_message_from_tasks( - app, - tasks, - progress=time_since_launch.total_seconds() - / instance_max_time_to_start.total_seconds(), - ) + if tasks: + await auto_scaling_mode.log_message_from_tasks( + app, tasks, message=msg, level=logging.INFO + ) + await auto_scaling_mode.progress_message_from_tasks( + app, + tasks, + progress=time_since_launch.total_seconds() + / instance_max_time_to_start.total_seconds(), + ) + + +async def _notify_machine_creation_progress( + app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling +) -> None: + await _notify_based_on_machine_type( + app, + cluster.pending_nodes, + auto_scaling_mode, + message="machine joined cluster! 
waiting for connection", + ) + await _notify_based_on_machine_type( + app, + cluster.pending_ec2s, + auto_scaling_mode, + message="waiting for machine to join cluster", + ) async def _autoscale_cluster( @@ -926,5 +948,5 @@ async def auto_scale_cluster( cluster = await _try_attach_pending_ec2s(app, cluster, auto_scaling_mode) cluster = await _autoscale_cluster(app, cluster, auto_scaling_mode) - await _notify_machine_creation_progress(app, cluster) + await _notify_machine_creation_progress(app, cluster, auto_scaling_mode) await _notify_autoscaling_status(app, cluster, auto_scaling_mode) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index f4ff24a5c08..2feeb12bdbe 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -83,13 +83,13 @@ async def log_message_from_tasks( app: FastAPI, tasks: list, message: str, *, level: LogLevelInt ) -> None: assert app # nosec - assert tasks # nosec + assert tasks is not None # nosec _logger.log(level, "LOG: %s", message) @staticmethod async def progress_message_from_tasks(app: FastAPI, tasks: list, progress: float): assert app # nosec - assert tasks # nosec + assert tasks is not None # nosec _logger.info("PROGRESS: %s", f"{progress:.2f}") @staticmethod From c4b897b9a1c90571ebdc192f09c1071535f89bf8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:07:20 +0100 Subject: [PATCH 56/86] reduce log pollution fixed computation of new instance types --- .../modules/auto_scaling_core.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index c988616bd3a..eb0e032fe72 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -290,12 +290,13 @@ def _try_assign_task_to_ec2_instance( ): continue if instance.has_resources_for_task(task_required_resources): - _logger.info( + instance.assign_task(task, task_required_resources) + _logger.debug( "%s", - f"assigning task with {task_required_resources=}, {task_required_ec2_instance=} to " - f"{instance.ec2_instance.id=},{instance.ec2_instance.type}: {instance.available_resources}/{instance.ec2_instance.resources}", + f"assigned task with {task_required_resources=}, {task_required_ec2_instance=} to " + f"{instance.ec2_instance.id=}:{instance.ec2_instance.type}, " + f"remaining resources:{instance.available_resources}/{instance.ec2_instance.resources}", ) - instance.assign_task(task, task_required_resources) return True return False @@ -313,12 +314,13 @@ def _try_assign_task_to_ec2_instance_type( ): continue if instance.has_resources_for_task(task_required_resources): - _logger.info( + instance.assign_task(task, task_required_resources) + _logger.debug( "%s", - f"assigning task with {task_required_resources=}, {task_required_ec2_instance=} to " - f"{instance.instance_type}: {instance.available_resources}/{instance.instance_type.resources}", + f"assigned task with {task_required_resources=}, {task_required_ec2_instance=} to " + f"{instance.instance_type}, " + f"remaining resources:{instance.available_resources}/{instance.instance_type.resources}", ) - instance.assign_task(task, task_required_resources) return True return False @@ -367,7 +369,7 @@ async def _assign_tasks_to_current_cluster( assignment(task, task_required_ec2_instance, task_required_resources) for assignment in assignment_functions ): - _logger.info("assigned task to cluster") + 
_logger.debug("assigned task to cluster") else: unassigned_tasks.append(task) @@ -421,7 +423,8 @@ async def _find_needed_instances( AssignedTasksToInstanceType( instance_type=defined_ec2, assigned_tasks=[task], - available_resources=defined_ec2.resources, + available_resources=defined_ec2.resources + - task_required_resources, ) ) else: @@ -435,7 +438,8 @@ async def _find_needed_instances( AssignedTasksToInstanceType( instance_type=best_ec2_instance, assigned_tasks=[task], - available_resources=best_ec2_instance.resources, + available_resources=best_ec2_instance.resources + - task_required_resources, ) ) except Ec2InstanceNotFoundError: @@ -447,6 +451,14 @@ async def _find_needed_instances( except Ec2InstanceInvalidError: _logger.exception("Unexpected error:") + _logger.info( + "found following needed instances: %s", + [ + f"{i.instance_type.name=}:{i.instance_type.resources} with {len(i.assigned_tasks)} tasks" + for i in needed_new_instance_types_for_tasks + ], + ) + num_instances_per_type = collections.defaultdict( int, collections.Counter( @@ -679,19 +691,13 @@ async def _scale_up_cluster( return cluster -async def _deactivate_empty_nodes( - app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling -) -> Cluster: +async def _deactivate_empty_nodes(app: FastAPI, cluster: Cluster) -> Cluster: docker_client = get_docker_client(app) active_empty_instances: list[AssociatedInstance] = [] active_non_empty_instances: list[AssociatedInstance] = [] for instance in cluster.active_nodes: try: - node_used_resources = await auto_scaling_mode.compute_node_used_resources( - app, - instance, - ) - if node_used_resources == Resources.create_as_empty(): + if instance.available_resources == instance.ec2_instance.resources: active_empty_instances.append(instance) else: active_non_empty_instances.append(instance) @@ -903,7 +909,7 @@ async def _autoscale_cluster( ) # NOTE: we only scale down in case we did not just scale up. 
The swarm needs some time to adjust await auto_scaling_mode.try_retire_nodes(app) - cluster = await _deactivate_empty_nodes(app, cluster, auto_scaling_mode) + cluster = await _deactivate_empty_nodes(app, cluster) cluster = await _try_scale_down_cluster(app, cluster) return cluster From 84aa9e5ff368c231c8d773289ddd421a52b83c84 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:07:29 +0100 Subject: [PATCH 57/86] syntax --- packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py b/packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py index c3af235c220..54019faca11 100644 --- a/packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py +++ b/packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py @@ -75,7 +75,7 @@ async def dask_spec_local_cluster( scheduler_address = URL(cluster.scheduler_address) monkeypatch.setenv( "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL", - f"{scheduler_address}" or "invalid", + f"{scheduler_address or 'invalid'}", ) yield cluster @@ -95,7 +95,7 @@ async def dask_local_cluster_without_workers( scheduler_address = URL(cluster.scheduler_address) monkeypatch.setenv( "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL", - f"{scheduler_address}" or "invalid", + f"{scheduler_address or 'invalid'}", ) yield cluster From dd047a6caad5dac8e1df98379bad0a0ae1fd0c7a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:07:38 +0100 Subject: [PATCH 58/86] syntax --- services/autoscaling/tests/unit/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index c13c9be2f16..ca0f36c640a 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -595,6 +595,7 @@ 
def _creator(**cluter_overrides) -> Cluster: return dataclasses.replace( Cluster( active_nodes=[], + pending_nodes=[], drained_nodes=[], reserve_drained_nodes=[], pending_ec2s=[], From 80e9f0963eb742e1d131a8a4750bc38ff0058904 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:07:53 +0100 Subject: [PATCH 59/86] fixed tests --- ...test_modules_auto_scaling_computational.py | 95 ++++--------------- 1 file changed, 19 insertions(+), 76 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index f909caab521..641d2792321 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -12,7 +12,6 @@ import logging from collections import defaultdict from collections.abc import Callable, Iterator -from copy import deepcopy from dataclasses import dataclass from typing import Any from unittest import mock @@ -34,15 +33,8 @@ from pytest_mock import MockerFixture from pytest_simcore.helpers.utils_envs import EnvVarsDict, setenvs_from_dict from simcore_service_autoscaling.core.settings import ApplicationSettings -from simcore_service_autoscaling.models import ( - AssociatedInstance, - Cluster, - EC2InstanceData, -) -from simcore_service_autoscaling.modules.auto_scaling_core import ( - _deactivate_empty_nodes, - auto_scale_cluster, -) +from simcore_service_autoscaling.models import EC2InstanceData +from simcore_service_autoscaling.modules.auto_scaling_core import auto_scale_cluster from simcore_service_autoscaling.modules.auto_scaling_mode_computational import ( ComputationalAutoscaling, ) @@ -312,6 +304,15 @@ def mock_dask_get_worker_used_resources(mocker: MockerFixture) -> mock.Mock: ) +@pytest.fixture +def mock_dask_is_worker_connected(mocker: MockerFixture) -> mock.Mock: + return 
mocker.patch( + "simcore_service_autoscaling.modules.dask.is_worker_connected", + return_value=True, + autospec=True, + ) + + async def _create_task_with_resources( ec2_client: EC2Client, dask_task_imposed_ec2_type: InstanceTypeType | None, @@ -384,6 +385,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_docker_compute_node_used_resources: mock.Mock, mock_dask_get_worker_has_results_in_memory: mock.Mock, mock_dask_get_worker_used_resources: mock.Mock, + mock_dask_is_worker_connected: mock.Mock, mocker: MockerFixture, dask_spec_local_cluster: distributed.SpecCluster, create_dask_task_resources: Callable[..., DaskTaskResources], @@ -425,6 +427,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_docker_compute_node_used_resources.assert_not_called() mock_dask_get_worker_has_results_in_memory.assert_not_called() mock_dask_get_worker_used_resources.assert_not_called() + mock_dask_is_worker_connected.assert_not_called() # check rabbit messages were sent _assert_rabbit_autoscaling_message_sent( mock_rabbitmq_post_message, @@ -444,6 +447,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_dask_get_worker_has_results_in_memory.reset_mock() mock_dask_get_worker_used_resources.assert_called_once() mock_dask_get_worker_used_resources.reset_mock() + mock_dask_is_worker_connected.assert_not_called() internal_dns_names = await _assert_ec2_instances( ec2_client, num_reservations=1, @@ -494,6 +498,9 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 await auto_scale_cluster( app=initialized_app, auto_scaling_mode=auto_scaling_mode ) + mock_dask_is_worker_connected.assert_called() + assert mock_dask_is_worker_connected.call_count == num_useless_calls + mock_dask_is_worker_connected.reset_mock() mock_dask_get_worker_has_results_in_memory.assert_called() assert ( mock_dask_get_worker_has_results_in_memory.call_count == 2 * num_useless_calls @@ -525,6 +532,8 @@ async def test_cluster_scaling_up_and_down( # noqa: 
PLR0915 del dask_future await auto_scale_cluster(app=initialized_app, auto_scaling_mode=auto_scaling_mode) + mock_dask_is_worker_connected.assert_called_once() + mock_dask_is_worker_connected.reset_mock() mock_dask_get_worker_has_results_in_memory.assert_called() assert mock_dask_get_worker_has_results_in_memory.call_count == 2 mock_dask_get_worker_has_results_in_memory.reset_mock() @@ -925,69 +934,3 @@ async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_star assert len(all_instances["Reservations"]) == len( aws_allowed_ec2_instance_type_names ) - - -@pytest.fixture -def fake_associated_host_instance( - host_node: DockerNode, - fake_localhost_ec2_instance_data: EC2InstanceData, -) -> AssociatedInstance: - return AssociatedInstance( - host_node, - fake_localhost_ec2_instance_data, - ) - - -async def test__deactivate_empty_nodes( - minimal_configuration: None, - initialized_app: FastAPI, - cluster: Callable[..., Cluster], - host_node: DockerNode, - fake_associated_host_instance: AssociatedInstance, - mock_docker_set_node_availability: mock.Mock, -): - # since we have no service running, we expect the passed node to be set to drain - active_cluster = cluster(active_nodes=[fake_associated_host_instance]) - updated_cluster = await _deactivate_empty_nodes( - initialized_app, active_cluster, ComputationalAutoscaling() - ) - assert not updated_cluster.active_nodes - assert len(updated_cluster.drained_nodes) == len(active_cluster.active_nodes) - mock_docker_set_node_availability.assert_called_once_with( - mock.ANY, host_node, available=False - ) - - -async def test__deactivate_empty_nodes_with_finished_tasks_should_not_deactivate_until_tasks_are_retrieved( - minimal_configuration: None, - initialized_app: FastAPI, - cluster: Callable[..., Cluster], - host_node: DockerNode, - fake_associated_host_instance: AssociatedInstance, - mock_docker_set_node_availability: mock.Mock, - create_dask_task: Callable[[DaskTaskResources], distributed.Future], -): - 
dask_future = create_dask_task({}) - assert dask_future - # NOTE: this sucks, but it seems that as soon as we use any method of the future it returns the data to the caller - await asyncio.sleep(4) - # since we have result still in memory, the node shall remain active - active_cluster = cluster(active_nodes=[fake_associated_host_instance]) - - updated_cluster = await _deactivate_empty_nodes( - initialized_app, deepcopy(active_cluster), ComputationalAutoscaling() - ) - assert updated_cluster.active_nodes - mock_docker_set_node_availability.assert_not_called() - - # now removing the dask_future shall remove the result from the memory - del dask_future - await asyncio.sleep(4) - updated_cluster = await _deactivate_empty_nodes( - initialized_app, deepcopy(active_cluster), ComputationalAutoscaling() - ) - assert not updated_cluster.active_nodes - assert len(updated_cluster.drained_nodes) == len(active_cluster.active_nodes) - mock_docker_set_node_availability.assert_called_once_with( - mock.ANY, host_node, available=False - ) From 523d3c2a5ce3726dcb96b3f43754fc45be1b9238 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:29:20 +0100 Subject: [PATCH 60/86] fixed comparison --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index eb0e032fe72..02ac637d9fc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -310,7 +310,7 @@ def _try_assign_task_to_ec2_instance_type( ) -> bool: for instance in instances: if task_required_ec2_instance and ( - task_required_ec2_instance != instance.instance_type + task_required_ec2_instance != 
instance.instance_type.name ): continue if instance.has_resources_for_task(task_required_resources): From 4dcb929ebd2de2e517f91cf2a5268644d0dae05c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:29:40 +0100 Subject: [PATCH 61/86] fixed confusing errors while running --- .../unit/test_modules_auto_scaling_dynamic.py | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index ef1d31de6f8..079b4e132e2 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -22,6 +22,7 @@ from models_library.docker import ( DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY, DockerLabelKey, + StandardSimcoreDockerLabels, ) from models_library.generated_models.docker_rest_api import ( Availability, @@ -370,7 +371,7 @@ async def _assert_ec2_instances( assert "Instances" in reservation assert ( len(reservation["Instances"]) == num_instances - ), f"created {num_instances} instances of {reservation['Instances'][0]['InstanceType'] if num_instances > 0 else 'n/a'}" + ), f"expected {num_instances}, found {len(reservation['Instances'])}" for instance in reservation["Instances"]: assert "InstanceType" in instance assert instance["InstanceType"] == instance_type @@ -721,6 +722,7 @@ class _ScaleUpParams: async def test_cluster_scaling_up_starts_multiple_instances( minimal_configuration: None, service_monitored_labels: dict[DockerLabelKey, str], + osparc_docker_label_keys: StandardSimcoreDockerLabels, app_settings: ApplicationSettings, initialized_app: FastAPI, create_service: Callable[ @@ -748,7 +750,8 @@ async def test_cluster_scaling_up_starts_multiple_instances( int(scale_up_params.service_resources.cpus), scale_up_params.service_resources.ram, ), - 
service_monitored_labels, + service_monitored_labels + | osparc_docker_label_keys.to_simcore_runtime_docker_labels(), "pending", [ f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params.imposed_instance_type}" @@ -798,11 +801,11 @@ async def test__deactivate_empty_nodes( ): # since we have no service running, we expect the passed node to be set to drain active_cluster = cluster( - active_nodes=[AssociatedInstance(host_node, fake_ec2_instance_data())] - ) - updated_cluster = await _deactivate_empty_nodes( - initialized_app, active_cluster, DynamicAutoscaling() + active_nodes=[ + AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + ] ) + updated_cluster = await _deactivate_empty_nodes(initialized_app, active_cluster) assert not updated_cluster.active_nodes assert len(updated_cluster.drained_nodes) == len(active_cluster.active_nodes) mock_docker_set_node_availability.assert_called_once_with( @@ -834,11 +837,11 @@ async def test__deactivate_empty_nodes_to_drain_when_services_running_are_missin "running", ) active_cluster = cluster( - active_nodes=[AssociatedInstance(host_node, fake_ec2_instance_data())] - ) - updated_cluster = await _deactivate_empty_nodes( - initialized_app, active_cluster, DynamicAutoscaling() + active_nodes=[ + AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + ] ) + updated_cluster = await _deactivate_empty_nodes(initialized_app, active_cluster) assert not updated_cluster.active_nodes assert len(updated_cluster.drained_nodes) == len(active_cluster.active_nodes) mock_docker_set_node_availability.assert_called_once_with( @@ -875,11 +878,11 @@ async def test__deactivate_empty_nodes_does_not_drain_if_service_is_running_with # since we have no service running, we expect the passed node to be set to drain active_cluster = cluster( - active_nodes=[AssociatedInstance(host_node, fake_ec2_instance_data())] - ) - updated_cluster = await _deactivate_empty_nodes( - 
initialized_app, active_cluster, DynamicAutoscaling() + active_nodes=[ + AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + ] ) + updated_cluster = await _deactivate_empty_nodes(initialized_app, active_cluster) assert updated_cluster == active_cluster mock_docker_set_node_availability.assert_not_called() @@ -893,9 +896,13 @@ async def test__find_terminateable_nodes_with_no_hosts( ): # there is no node to terminate here since nothing is drained active_cluster = cluster( - active_nodes=[AssociatedInstance(host_node, fake_ec2_instance_data())], + active_nodes=[ + AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + ], drained_nodes=[], - reserve_drained_nodes=[AssociatedInstance(host_node, fake_ec2_instance_data())], + reserve_drained_nodes=[ + AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + ], ) assert await _find_terminateable_instances(initialized_app, active_cluster) == [] @@ -921,8 +928,8 @@ def _creator(node: Node, terminateable_time: bool) -> AssociatedInstance: else datetime.timedelta(seconds=10) ) return AssociatedInstance( - node, - fake_ec2_instance_data( + node=node, + ec2_instance=fake_ec2_instance_data( launch_time=datetime.datetime.now(datetime.timezone.utc) - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION - datetime.timedelta( @@ -974,7 +981,7 @@ async def test__activate_drained_nodes_with_no_tasks( # no tasks, does nothing and returns True empty_cluster = cluster() still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, empty_cluster, [], DynamicAutoscaling() + initialized_app, empty_cluster, [] ) assert not still_pending_tasks assert updated_cluster == empty_cluster @@ -989,7 +996,7 @@ async def test__activate_drained_nodes_with_no_tasks( ], ) still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, active_cluster, [], DynamicAutoscaling() + initialized_app, active_cluster, [] ) 
assert not still_pending_tasks assert updated_cluster == active_cluster @@ -1032,10 +1039,7 @@ async def test__activate_drained_nodes_with_no_drained_nodes( active_nodes=[create_associated_instance(host_node, True)] # noqa: FBT003 ) still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, - cluster_without_drained_nodes, - service_tasks, - DynamicAutoscaling(), + initialized_app, cluster_without_drained_nodes, service_tasks ) assert still_pending_tasks == service_tasks assert updated_cluster == cluster_without_drained_nodes @@ -1082,7 +1086,7 @@ async def test__activate_drained_nodes_with_drained_node( ) still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, cluster_with_drained_nodes, service_tasks, DynamicAutoscaling() + initialized_app, cluster_with_drained_nodes, service_tasks ) assert not still_pending_tasks assert updated_cluster.active_nodes == cluster_with_drained_nodes.drained_nodes From e46890b4af4cad2ade4dc870276f6cde53741c94 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 13:10:37 +0100 Subject: [PATCH 62/86] only upgrade available resources if not set in constructor --- .../autoscaling/src/simcore_service_autoscaling/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index aa6745b7e8f..ad8e871c341 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -41,7 +41,10 @@ class _BaseInstance: _available_resources: Resources = field(default_factory=Resources.create_as_empty) def __post_init__(self) -> None: - object.__setattr__(self, "_available_resources", self.ec2_instance.resources) + if self._available_resources == Resources.create_as_empty(): + object.__setattr__( + self, 
"_available_resources", self.ec2_instance.resources + ) def has_resources_for_task(self, task_resources: Resources) -> bool: return bool(self._available_resources >= task_resources) From dc29dd7d8b739dcd7be39bbe530464b41f5c8ee4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 13:11:07 +0100 Subject: [PATCH 63/86] tests are fixed here --- .../autoscaling/tests/unit/test_modules_dask.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 24a679773ba..4863f277777 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -9,9 +9,11 @@ import distributed import pytest +from arrow import utcnow from aws_library.ec2.models import Resources from faker import Faker from pydantic import AnyUrl, ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_host import get_localhost_ip from simcore_service_autoscaling.core.errors import ( DaskNoWorkersError, DaskSchedulerNotFoundError, @@ -56,6 +58,7 @@ def dask_workers_config() -> dict[str, Any]: "options": { "nthreads": 2, "resources": {"CPU": 2, "RAM": 48e9}, + "name": f"dask-sidecar_ip-{get_localhost_ip().replace('.', '-')}_{utcnow()}", }, } } @@ -107,21 +110,23 @@ def _add_fct(x: int, y: int) -> int: return x + y # there is nothing now - assert await list_processing_tasks_per_worker(url=scheduler_url) == [] + assert await list_processing_tasks_per_worker(url=scheduler_url) == {} # this function will be queued and executed as there are no specific resources needed future_queued_task = dask_spec_cluster_client.submit(_add_fct, 2, 5) assert future_queued_task - assert await list_processing_tasks_per_worker(scheduler_url) == [ - DaskTaskId(future_queued_task.key) - ] + assert await list_processing_tasks_per_worker(scheduler_url) == { + 
next(iter(dask_spec_cluster_client.scheduler_info()["workers"])): [ + DaskTask(task_id=DaskTaskId(future_queued_task.key), required_resources={}) + ] + } result = await future_queued_task.result(timeout=_REMOTE_FCT_SLEEP_TIME_S + 4) # type: ignore assert result == 7 # nothing processing anymore - assert await list_processing_tasks_per_worker(url=scheduler_url) == [] + assert await list_processing_tasks_per_worker(url=scheduler_url) == {} _DASK_SCHEDULER_REACTION_TIME_S: Final[int] = 4 From 81e36cdd1d1ad3a7f9b3b4803f009c73eab9a3de Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 13:11:19 +0100 Subject: [PATCH 64/86] fixed tests --- .../unit/test_modules_auto_scaling_dynamic.py | 47 ++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 079b4e132e2..38f157b53f0 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -877,9 +877,31 @@ async def test__deactivate_empty_nodes_does_not_drain_if_service_is_running_with ) # since we have no service running, we expect the passed node to be set to drain + assert host_node.Description + assert host_node.Description.Resources + assert host_node.Description.Resources.NanoCPUs + host_node_resources = Resources.parse_obj( + { + "ram": host_node.Description.Resources.MemoryBytes, + "cpus": host_node.Description.Resources.NanoCPUs / 10**9, + } + ) + fake_ec2_instance = fake_ec2_instance_data(resources=host_node_resources) + fake_associated_instance = AssociatedInstance( + node=host_node, ec2_instance=fake_ec2_instance + ) + node_used_resources = await DynamicAutoscaling().compute_node_used_resources( + initialized_app, fake_associated_instance + ) + assert node_used_resources + active_cluster = 
cluster( active_nodes=[ - AssociatedInstance(node=host_node, ec2_instance=fake_ec2_instance_data()) + AssociatedInstance( + node=host_node, + ec2_instance=fake_ec2_instance, + _available_resources=host_node_resources - node_used_resources, + ) ] ) updated_cluster = await _deactivate_empty_nodes(initialized_app, active_cluster) @@ -980,10 +1002,9 @@ async def test__activate_drained_nodes_with_no_tasks( ): # no tasks, does nothing and returns True empty_cluster = cluster() - still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, empty_cluster, [] + updated_cluster = await _activate_drained_nodes( + initialized_app, empty_cluster, DynamicAutoscaling() ) - assert not still_pending_tasks assert updated_cluster == empty_cluster active_cluster = cluster( @@ -995,10 +1016,9 @@ async def test__activate_drained_nodes_with_no_tasks( create_associated_instance(drained_host_node, True) # noqa: FBT003 ], ) - still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, active_cluster, [] + updated_cluster = await _activate_drained_nodes( + initialized_app, active_cluster, DynamicAutoscaling() ) - assert not still_pending_tasks assert updated_cluster == active_cluster mock_tag_node.assert_not_called() @@ -1038,10 +1058,9 @@ async def test__activate_drained_nodes_with_no_drained_nodes( cluster_without_drained_nodes = cluster( active_nodes=[create_associated_instance(host_node, True)] # noqa: FBT003 ) - still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, cluster_without_drained_nodes, service_tasks + updated_cluster = await _activate_drained_nodes( + initialized_app, cluster_without_drained_nodes, DynamicAutoscaling() ) - assert still_pending_tasks == service_tasks assert updated_cluster == cluster_without_drained_nodes mock_tag_node.assert_not_called() @@ -1084,11 +1103,13 @@ async def test__activate_drained_nodes_with_drained_node( create_associated_instance(drained_host_node, True) 
# noqa: FBT003 ] ) + cluster_with_drained_nodes.drained_nodes[0].assign_task( + service_tasks[0], Resources(cpus=int(host_cpu_count / 2 + 1), ram=ByteSize(0)) + ) - still_pending_tasks, updated_cluster = await _activate_drained_nodes( - initialized_app, cluster_with_drained_nodes, service_tasks + updated_cluster = await _activate_drained_nodes( + initialized_app, cluster_with_drained_nodes, DynamicAutoscaling() ) - assert not still_pending_tasks assert updated_cluster.active_nodes == cluster_with_drained_nodes.drained_nodes assert drained_host_node.Spec mock_tag_node.assert_called_once_with( From 61bafbec0d6cecd9be1b4635af30ad97e86412c5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 13:35:59 +0100 Subject: [PATCH 65/86] fixed tests --- services/autoscaling/tests/unit/test_utils_ec2.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/services/autoscaling/tests/unit/test_utils_ec2.py b/services/autoscaling/tests/unit/test_utils_ec2.py index 49b06586cc8..575e17958ad 100644 --- a/services/autoscaling/tests/unit/test_utils_ec2.py +++ b/services/autoscaling/tests/unit/test_utils_ec2.py @@ -77,10 +77,7 @@ async def test_find_best_fitting_ec2_instance_closest_instance_policy( score_type=closest_instance_policy, ) - SKIPPED_KEYS = ["name"] - for k in found_instance.__dict__: - if k not in SKIPPED_KEYS: - assert getattr(found_instance, k) == getattr(expected_ec2_instance, k) + assert found_instance.resources == expected_ec2_instance.resources def test_compose_user_data(faker: Faker): From eb14d4b1037eef80e739ddb277773041390ce865 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:08:41 +0100 Subject: [PATCH 66/86] mypy --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 02ac637d9fc..1656aabfcd9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -824,7 +824,7 @@ async def _notify_based_on_machine_type( instance_max_time_to_start = ( app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME ) - launch_time_to_tasks = collections.defaultdict(list) + launch_time_to_tasks: dict[datetime.datetime, list] = collections.defaultdict(list) now = datetime.datetime.now(datetime.timezone.utc) for instance in instances: launch_time_to_tasks[ From ebf85f63d21cfe1c9eac50990d8bb016f7a1e762 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:15:05 +0100 Subject: [PATCH 67/86] removed unused code --- .../utils/dynamic_scaling.py | 59 ------------ .../tests/unit/test_utils_dynamic_scaling.py | 96 ------------------- 2 files changed, 155 deletions(-) delete mode 100644 services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py delete mode 100644 services/autoscaling/tests/unit/test_utils_dynamic_scaling.py diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py deleted file mode 100644 index a5f1c56fea3..00000000000 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/dynamic_scaling.py +++ /dev/null @@ -1,59 +0,0 @@ -import datetime -import logging - -from fastapi import FastAPI -from models_library.generated_models.docker_rest_api import Task -from servicelib.utils_formatting import timedelta_as_minute_second - -from ..core.settings import get_application_settings -from ..models import AssignedTasksToInstance -from . 
import utils_docker -from .rabbitmq import log_tasks_message, progress_tasks_message - -logger = logging.getLogger(__name__) - - -# NOTE: DEPRECATED module!!! kept until the logging is moved to auto_scaling_core.py - - -async def try_assigning_task_to_instances( - app: FastAPI, - task: Task, - instances_to_tasks: list[AssignedTasksToInstance], - *, - notify_progress: bool, -) -> bool: - app_settings = get_application_settings(app) - assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec - instance_max_time_to_start = ( - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME - ) - task_required_resources = utils_docker.get_max_resources_from_docker_task(task) - for assigned_tasks_to_instance in instances_to_tasks: - if assigned_tasks_to_instance.has_resources_for_task(task_required_resources): - assigned_tasks_to_instance.assign_task(task, task_required_resources) - if notify_progress: - now = datetime.datetime.now(datetime.timezone.utc) - time_since_launch = ( - now - assigned_tasks_to_instance.instance.launch_time - ) - estimated_time_to_completion = ( - assigned_tasks_to_instance.instance.launch_time - + instance_max_time_to_start - - now - ) - - await log_tasks_message( - app, - [task], - f"adding machines to the cluster (time waiting: {timedelta_as_minute_second(time_since_launch)}, " - f"est. 
remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait...", - ) - await progress_tasks_message( - app, - [task], - time_since_launch.total_seconds() - / instance_max_time_to_start.total_seconds(), - ) - return True - return False diff --git a/services/autoscaling/tests/unit/test_utils_dynamic_scaling.py b/services/autoscaling/tests/unit/test_utils_dynamic_scaling.py deleted file mode 100644 index ab3b3c55e3f..00000000000 --- a/services/autoscaling/tests/unit/test_utils_dynamic_scaling.py +++ /dev/null @@ -1,96 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=unused-variable -# pylint: disable=too-many-arguments - - -from collections.abc import Callable -from datetime import timedelta - -import pytest -from aws_library.ec2.models import EC2InstanceData, Resources -from faker import Faker -from models_library.generated_models.docker_rest_api import Task -from pydantic import ByteSize -from pytest_mock import MockerFixture -from simcore_service_autoscaling.models import AssignedTasksToInstance -from simcore_service_autoscaling.utils.dynamic_scaling import ( - try_assigning_task_to_instances, -) - - -@pytest.fixture -def fake_task(faker: Faker) -> Callable[..., Task]: - def _creator(**overrides) -> Task: - return Task( - **({"ID": faker.uuid4(), "Name": faker.pystr(), "Spec": {}} | overrides) - ) - - return _creator - - -async def test_try_assigning_task_to_instances_with_no_instances( - mocker: MockerFixture, - fake_task: Callable[..., Task], -): - fake_app = mocker.Mock() - pending_task = fake_task() - assert ( - await try_assigning_task_to_instances( - fake_app, pending_task, [], notify_progress=True - ) - is False - ) - - -async def test_try_assigning_task_to_instances( - mocker: MockerFixture, - fake_task: Callable[..., Task], - fake_ec2_instance_data: Callable[..., EC2InstanceData], -): - fake_app = mocker.Mock() - 
fake_app.state.settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_START_TIME = ( - timedelta(minutes=1) - ) - pending_task = fake_task( - Spec={"Resources": {"Reservations": {"NanoCPUs": 2 * 1e9}}} - ) - fake_instance = fake_ec2_instance_data() - pending_instance_to_tasks: list[AssignedTasksToInstance] = [ - AssignedTasksToInstance( - instance=fake_instance, - assigned_tasks=[], - available_resources=Resources(cpus=4, ram=ByteSize(1024**3)), - ) - ] - - # calling once should allow to add that task to the instance - assert ( - await try_assigning_task_to_instances( - fake_app, - pending_task, - pending_instance_to_tasks, - notify_progress=True, - ) - is True - ) - # calling a second time as well should allow to add that task to the instance - assert ( - await try_assigning_task_to_instances( - fake_app, - pending_task, - pending_instance_to_tasks, - notify_progress=True, - ) - is True - ) - # calling a third time should fail - assert ( - await try_assigning_task_to_instances( - fake_app, - pending_task, - pending_instance_to_tasks, - notify_progress=True, - ) - is False - ) From 0e18cf189fcbd5b246c66094704d96200ba6e5d6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:25:37 +0100 Subject: [PATCH 68/86] remove catastrophic backtracking --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- services/autoscaling/tests/unit/test_modules_dask.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 53f78937a97..c7618713d95 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -54,7 +54,7 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: 
TypeAlias = dict[str, Any] DASK_NAME_PATTERN: Final[re.Pattern] = re.compile( - r"^.+_(?Pip-\d+-\d+-\d+-\d+).+$" + r"^(.+_)?(?Pip-\d+-\d+-\d+-\d+)_.+$" ) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4863f277777..4b30887cf0e 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -53,7 +53,7 @@ def scheduler_url(dask_spec_local_cluster: distributed.SpecCluster) -> AnyUrl: def dask_workers_config() -> dict[str, Any]: # NOTE: override of pytest-simcore dask_workers_config to have only 1 worker return { - "single-cpu-worker": { + "single-cpu_worker": { "cls": distributed.Worker, "options": { "nthreads": 2, From 0441def562423bb255bde71dbf83045f5b81aa22 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:32:27 +0100 Subject: [PATCH 69/86] remove catastrophic backtracking --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index c7618713d95..7149cd2fa9b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -54,7 +54,7 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: TypeAlias = dict[str, Any] DASK_NAME_PATTERN: Final[re.Pattern] = re.compile( - r"^(.+_)?(?Pip-\d+-\d+-\d+-\d+)_.+$" + r"^(.+_)?(?Pip-(?:\d{1,3}-){3}\d{1,3})_.*$" ) From cd99cd1aeca7b19c9054c370a77106727b9bf21f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 13:58:59 +0000 Subject: [PATCH 70/86] also check pending nodes do not send 
message for pending nodes --- .../modules/auto_scaling_core.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 1656aabfcd9..f1c2301af63 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -695,7 +695,7 @@ async def _deactivate_empty_nodes(app: FastAPI, cluster: Cluster) -> Cluster: docker_client = get_docker_client(app) active_empty_instances: list[AssociatedInstance] = [] active_non_empty_instances: list[AssociatedInstance] = [] - for instance in cluster.active_nodes: + for instance in cluster.active_nodes + cluster.pending_nodes: try: if instance.available_resources == instance.ec2_instance.resources: active_empty_instances.append(instance) @@ -853,12 +853,6 @@ async def _notify_based_on_machine_type( async def _notify_machine_creation_progress( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling ) -> None: - await _notify_based_on_machine_type( - app, - cluster.pending_nodes, - auto_scaling_mode, - message="machine joined cluster! 
waiting for connection", - ) await _notify_based_on_machine_type( app, cluster.pending_ec2s, From a73f7b7e3666beec0d4bfc2356c8d38b3d00fb39 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 18:13:10 +0100 Subject: [PATCH 71/86] necessary for docker >24 --- .../src/simcore_service_autoscaling/utils/utils_docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index 3f0743cfdd8..d5d8d4f2cfe 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -426,7 +426,7 @@ def get_docker_pull_images_on_start_bash_command( write_docker_compose_pull_script_cmd = " ".join( [ "echo", - f'"#!/bin/sh\necho Pulling started at \\$(date)\n{_DOCKER_COMPOSE_CMD} --file={_PRE_PULL_COMPOSE_PATH} pull --ignore-pull-failures"', + f'"#!/bin/sh\necho Pulling started at \\$(date)\n{_DOCKER_COMPOSE_CMD} --project-name=autoscaleprepull --file={_PRE_PULL_COMPOSE_PATH} pull --ignore-pull-failures"', ">", f"{_DOCKER_COMPOSE_PULL_SCRIPT_PATH}", ] From 4b6f7873b17e6fd4b2ebce622012b32f7e1ebd42 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:49:35 +0100 Subject: [PATCH 72/86] remove too many logs --- .../modules/auto_scaling_core.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index f1c2301af63..ad2da6a72ff 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -669,19 +669,9 @@ async def 
_scale_up_cluster( "service is pending due to missing resources, scaling up cluster now...", level=logging.INFO, ) - # NOTE: notify the up-scaling progress started... - await auto_scaling_mode.progress_message_from_tasks( - app, unassigned_tasks, 0.001 - ) new_pending_instances = await _start_instances( app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode ) - await auto_scaling_mode.log_message_from_tasks( - app, - unassigned_tasks, - f"{len(new_pending_instances)} new machines being started, please wait...", - level=logging.INFO, - ) cluster.pending_ec2s.extend( [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances] ) From 78e2869d504a7393df2ac78de0d5f0c5d9fe3dd9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:49:58 +0100 Subject: [PATCH 73/86] only deactivate active nodes --- .../modules/auto_scaling_core.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index ad2da6a72ff..1e2cd2ff2c6 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -23,7 +23,6 @@ from types_aiobotocore_ec2.literals import InstanceTypeType from ..core.errors import ( - DaskWorkerNotFoundError, Ec2InstanceInvalidError, Ec2InstanceNotFoundError, Ec2InvalidDnsNameError, @@ -113,7 +112,7 @@ def _node_not_ready(node: Node) -> bool: ) _logger.debug( "current state: %s", - f"{json.dumps(jsonable_encoder(cluster), indent=2)}", + f"{json.dumps(jsonable_encoder(cluster, include={'active_nodes', 'pending_nodes', 'drained_nodes', 'reserve_drained_nodes', 'pending_ec2s'}), indent=2)}", ) return cluster @@ -685,16 +684,12 @@ async def _deactivate_empty_nodes(app: FastAPI, cluster: 
Cluster) -> Cluster: docker_client = get_docker_client(app) active_empty_instances: list[AssociatedInstance] = [] active_non_empty_instances: list[AssociatedInstance] = [] - for instance in cluster.active_nodes + cluster.pending_nodes: - try: - if instance.available_resources == instance.ec2_instance.resources: - active_empty_instances.append(instance) - else: - active_non_empty_instances.append(instance) - except DaskWorkerNotFoundError: # noqa: PERF203 - _logger.exception( - "EC2 node instance is not registered to dask-scheduler! TIP: Needs investigation" - ) + for instance in cluster.active_nodes: + if instance.available_resources == instance.ec2_instance.resources: + active_empty_instances.append(instance) + else: + active_non_empty_instances.append(instance) + if not active_empty_instances: return cluster _logger.info( From 3c8d3ed2b1341ce8a0728668a934da4d053c095e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:50:18 +0100 Subject: [PATCH 74/86] fixed regex --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 7149cd2fa9b..e47530a41b7 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -54,7 +54,7 @@ async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: DaskWorkerUrl: TypeAlias = str DaskWorkerDetails: TypeAlias = dict[str, Any] DASK_NAME_PATTERN: Final[re.Pattern] = re.compile( - r"^(.+_)?(?Pip-(?:\d{1,3}-){3}\d{1,3})_.*$" + r"^(?P.+)_(?Pip-\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3})[-_].*$" ) From e98b0e6c5caa5b7cfc1ecd50b4889ea41c95070a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 
22:08:25 +0100 Subject: [PATCH 75/86] fixed test --- services/autoscaling/tests/unit/test_utils_docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/tests/unit/test_utils_docker.py b/services/autoscaling/tests/unit/test_utils_docker.py index 7d3dc9cef95..fc0f3d90e8f 100644 --- a/services/autoscaling/tests/unit/test_utils_docker.py +++ b/services/autoscaling/tests/unit/test_utils_docker.py @@ -899,7 +899,7 @@ async def test_tag_node_out_of_sequence_error( 'image: itisfoundation/simcore/services/dynamic/service:23.5.5\nversion: \'"3.8"\'\n"' " > /docker-pull.compose.yml" " && " - 'echo "#!/bin/sh\necho Pulling started at \\$(date)\ndocker compose --file=/docker-pull.compose.yml pull --ignore-pull-failures" > /docker-pull-script.sh' + 'echo "#!/bin/sh\necho Pulling started at \\$(date)\ndocker compose --project-name=autoscaleprepull --file=/docker-pull.compose.yml pull --ignore-pull-failures" > /docker-pull-script.sh' " && " "chmod +x /docker-pull-script.sh" " && " From d6ef83594081d0c66c7dcd30f5c813df3b131a16 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 16 Jan 2024 22:14:55 +0100 Subject: [PATCH 76/86] clean up --- .../src/simcore_service_autoscaling/models.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index ad8e871c341..833831574fc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -6,20 +6,6 @@ from models_library.generated_models.docker_rest_api import Node -@dataclass(kw_only=True, slots=True) -class AssignedTasksToInstance: - instance: EC2InstanceData - available_resources: Resources - assigned_tasks: list - - def has_resources_for_task(self, task_resources: Resources) -> bool: - return bool(self.available_resources >= 
task_resources) - - def assign_task(self, task, task_resources: Resources) -> None: - self.assigned_tasks.append(task) - self.available_resources -= task_resources - - @dataclass(kw_only=True, slots=True) class AssignedTasksToInstanceType: instance_type: EC2InstanceType From d6d7b189fbe65a949a3d8952b282324f6c928d6c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 07:55:43 +0100 Subject: [PATCH 77/86] @pcrespov review: rename fct --- packages/aws-library/src/aws_library/ec2/models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/models.py b/packages/aws-library/src/aws_library/ec2/models.py index 1765a325a0e..79c73e89fae 100644 --- a/packages/aws-library/src/aws_library/ec2/models.py +++ b/packages/aws-library/src/aws_library/ec2/models.py @@ -54,10 +54,8 @@ def __sub__(self, other: "Resources") -> "Resources": @validator("cpus", pre=True) @classmethod - def ensure_negative_is_0(cls, v: float) -> float: - if v < 0: - return 0 - return v + def _floor_cpus_to_0(cls, v: float) -> float: + return max(v, 0) @dataclass(frozen=True, kw_only=True, slots=True) From 019d0bb05333776df830481fd0550f2a92d7e1bb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:07:52 +0100 Subject: [PATCH 78/86] @pcrespov review: improve code --- .../modules/dask.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index e47530a41b7..2df79c2fb15 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -191,29 +191,26 @@ async def get_worker_used_resources( """ def _get_worker_used_resources( - dask_scheduler: distributed.Scheduler, - ) 
-> dict[str, dict]: - used_resources: dict[str, dict] = {} + dask_scheduler: distributed.Scheduler, *, worker_url: str + ) -> dict[str, float] | None: for worker_name, worker_state in dask_scheduler.workers.items(): + if worker_url != worker_name: + continue if worker_state.status is distributed.Status.closing_gracefully: # NOTE: when a worker was retired it is in this state - used_resources[worker_name] = {} - else: - used_resources[worker_name] = worker_state.used_resources - return used_resources + return {} + return worker_state.used_resources + return None async with _scheduler_client(url) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) # now get the used resources - used_resources_per_worker: dict[ - str, dict[str, Any] - ] = await _wrap_client_async_routine( + worker_used_resources: dict[str, Any] | None = await _wrap_client_async_routine( client.run_on_scheduler(_get_worker_used_resources) ) - if worker_url not in used_resources_per_worker: + if worker_used_resources is None: raise DaskWorkerNotFoundError(worker_host=worker_url, url=url) - worker_used_resources = used_resources_per_worker[worker_url] return Resources( cpus=worker_used_resources.get("CPU", 0), ram=parse_obj_as(ByteSize, worker_used_resources.get("RAM", 0)), From c703f6f6c5ccdd191af61b1161bf0338565ea243 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:16:39 +0100 Subject: [PATCH 79/86] @pcrespov review: refactor --- .../src/simcore_service_autoscaling/modules/dask.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 2df79c2fb15..530036e36a1 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -138,7 +138,7 @@ async def 
list_processing_tasks_per_worker( DaskSchedulerNotFoundError """ - def _list_tasks( + def _list_processing_tasks( dask_scheduler: distributed.Scheduler, ) -> dict[str, list[tuple[DaskTaskId, DaskTaskResources]]]: worker_to_processing_tasks = defaultdict(list) @@ -152,13 +152,14 @@ def _list_tasks( async with _scheduler_client(url) as client: worker_to_tasks: dict[ str, list[tuple[DaskTaskId, DaskTaskResources]] - ] = await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) + ] = await _wrap_client_async_routine( + client.run_on_scheduler(_list_processing_tasks) + ) _logger.debug("found processing tasks: %s", worker_to_tasks) - tasks_per_worker = {} + tasks_per_worker = defaultdict(list) for worker, tasks in worker_to_tasks.items(): - tasks_per_worker[worker] = [ - DaskTask(task_id=t[0], required_resources=t[1]) for t in tasks - ] + for task_id, required_resources in tasks: + tasks_per_worker[worker].append(DaskTask(task_id, required_resources)) return tasks_per_worker From d23e109fe4fcd8abb96c1a24c5568b8cf05bdac4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:25:49 +0100 Subject: [PATCH 80/86] @pcrespov review: remove unused fct --- .../utils/computational_scaling.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py index 6f61f69a2f2..de13a16f49b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/computational_scaling.py @@ -26,10 +26,3 @@ def get_task_instance_restriction(task: DaskTask) -> str | None: task.required_resources ) return instance_ec2_type - - -def _compute_tasks_resources(tasks: list[DaskTask]) -> Resources: - return sum( - (resources_from_dask_task(t) for t in tasks), - 
Resources.create_as_empty(), - ) From b337597a80978e15a11462c29b957345eea4b060 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:33:05 +0100 Subject: [PATCH 81/86] @pcrespov review: add asserts to convey message --- .../src/simcore_service_autoscaling/utils/utils_ec2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py index d0ab38f352d..23475994622 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_ec2.py @@ -80,6 +80,8 @@ def closest_instance_policy( return 0 # compute a score for all the instances that are above expectations # best is the exact ec2 instance + assert ec2_instance.resources.cpus > 0 # nosec + assert ec2_instance.resources.ram > 0 # nosec cpu_ratio = float(ec2_instance.resources.cpus - resources.cpus) / float( ec2_instance.resources.cpus ) From 40a4e25216bbbe14108607882deaed75e41fa83c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:39:58 +0100 Subject: [PATCH 82/86] @pcrespov review: refactor --- .../src/simcore_service_autoscaling/models.py | 45 +++++++------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 833831574fc..dc2b575e2ee 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -6,44 +6,33 @@ from models_library.generated_models.docker_rest_api import Node -@dataclass(kw_only=True, slots=True) -class AssignedTasksToInstanceType: - instance_type: EC2InstanceType - assigned_tasks: list +@dataclass(frozen=True, slots=True, 
kw_only=True) +class TaskAssignmentMixin: + assigned_tasks: list = field(default_factory=list) available_resources: Resources - def has_resources_for_task(self, task_resources: Resources) -> bool: - return bool(self.available_resources >= task_resources) - def assign_task(self, task, task_resources: Resources) -> None: self.assigned_tasks.append(task) - self.available_resources -= task_resources + object.__setattr__( + self, "available_resources", self.available_resources - task_resources + ) + def has_resources_for_task(self, task_resources: Resources) -> bool: + return self.available_resources >= task_resources -@dataclass(frozen=True, kw_only=True, slots=True) -class _BaseInstance: - ec2_instance: EC2InstanceData - assigned_tasks: list = field(default_factory=list) - _available_resources: Resources = field(default_factory=Resources.create_as_empty) - def __post_init__(self) -> None: - if self._available_resources == Resources.create_as_empty(): - object.__setattr__( - self, "_available_resources", self.ec2_instance.resources - ) +@dataclass(frozen=True, kw_only=True, slots=True) +class AssignedTasksToInstanceType(TaskAssignmentMixin): + instance_type: EC2InstanceType - def has_resources_for_task(self, task_resources: Resources) -> bool: - return bool(self._available_resources >= task_resources) - def assign_task(self, task, task_resources: Resources) -> None: - self.assigned_tasks.append(task) - object.__setattr__( - self, "_available_resources", self._available_resources - task_resources - ) +@dataclass(frozen=True, kw_only=True, slots=True) +class _BaseInstance(TaskAssignmentMixin): + ec2_instance: EC2InstanceData - @property - def available_resources(self) -> Resources: - return self._available_resources + def __post_init__(self) -> None: + if self.available_resources == Resources.create_as_empty(): + object.__setattr__(self, "available_resources", self.ec2_instance.resources) @dataclass(frozen=True, kw_only=True, slots=True) From 
678742021a03fc5ef34333efdb4fdcc75c357ccd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:45:56 +0100 Subject: [PATCH 83/86] some typos --- .../src/simcore_service_autoscaling/models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index dc2b575e2ee..f87398de0f9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -7,9 +7,9 @@ @dataclass(frozen=True, slots=True, kw_only=True) -class TaskAssignmentMixin: +class _TaskAssignmentMixin: assigned_tasks: list = field(default_factory=list) - available_resources: Resources + available_resources: Resources = field(default_factory=Resources.create_as_empty) def assign_task(self, task, task_resources: Resources) -> None: self.assigned_tasks.append(task) @@ -18,16 +18,16 @@ def assign_task(self, task, task_resources: Resources) -> None: ) def has_resources_for_task(self, task_resources: Resources) -> bool: - return self.available_resources >= task_resources + return bool(self.available_resources >= task_resources) @dataclass(frozen=True, kw_only=True, slots=True) -class AssignedTasksToInstanceType(TaskAssignmentMixin): +class AssignedTasksToInstanceType(_TaskAssignmentMixin): instance_type: EC2InstanceType @dataclass(frozen=True, kw_only=True, slots=True) -class _BaseInstance(TaskAssignmentMixin): +class _BaseInstance(_TaskAssignmentMixin): ec2_instance: EC2InstanceData def __post_init__(self) -> None: From a178756393907bd5e286f88ae1cb503f2d5391db Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:47:21 +0100 Subject: [PATCH 84/86] mypy --- .../src/simcore_service_autoscaling/modules/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff 
--git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 530036e36a1..04f02313920 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -159,7 +159,9 @@ def _list_processing_tasks( tasks_per_worker = defaultdict(list) for worker, tasks in worker_to_tasks.items(): for task_id, required_resources in tasks: - tasks_per_worker[worker].append(DaskTask(task_id, required_resources)) + tasks_per_worker[worker].append( + DaskTask(task_id=task_id, required_resources=required_resources) + ) return tasks_per_worker @@ -200,7 +202,7 @@ def _get_worker_used_resources( if worker_state.status is distributed.Status.closing_gracefully: # NOTE: when a worker was retired it is in this state return {} - return worker_state.used_resources + return dict(worker_state.used_resources) return None async with _scheduler_client(url) as client: From a1dc532f4573ff51af04580ebab7259dfc3f0807 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:48:02 +0100 Subject: [PATCH 85/86] adapt to new naming --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 2 +- .../autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 1e2cd2ff2c6..63a31cfa195 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -88,7 +88,7 @@ def _node_not_ready(node: Node) -> bool: active_nodes.append( dataclasses.replace( instance, - _available_resources=instance.ec2_instance.resources + 
available_resources=instance.ec2_instance.resources - node_used_resources, ) ) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 38f157b53f0..6b11152ab09 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -900,7 +900,7 @@ async def test__deactivate_empty_nodes_does_not_drain_if_service_is_running_with AssociatedInstance( node=host_node, ec2_instance=fake_ec2_instance, - _available_resources=host_node_resources - node_used_resources, + available_resources=host_node_resources - node_used_resources, ) ] ) From 5eb5c36a716e3cd103f6f35f4b2474450db64a93 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 17 Jan 2024 08:52:30 +0100 Subject: [PATCH 86/86] fix calls to dask_scheduler stuff --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 04f02313920..47a5488b244 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -210,7 +210,7 @@ def _get_worker_used_resources( # now get the used resources worker_used_resources: dict[str, Any] | None = await _wrap_client_async_routine( - client.run_on_scheduler(_get_worker_used_resources) + client.run_on_scheduler(_get_worker_used_resources, worker_url=worker_url), ) if worker_used_resources is None: raise DaskWorkerNotFoundError(worker_host=worker_url, url=url)