Skip to content

Commit 09866f1

Browse files
authored
✨Clusters-keeper: terminate broken EC2s🚨 (#5851)
1 parent b9022f6 commit 09866f1

File tree

12 files changed

+360
-100
lines changed

12 files changed

+360
-100
lines changed

packages/aws-library/src/aws_library/ec2/client.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import contextlib
22
import logging
3+
from collections.abc import Iterable
34
from dataclasses import dataclass
45
from typing import cast
56

@@ -281,7 +282,9 @@ async def get_instances(
281282
)
282283
return all_instances
283284

284-
async def terminate_instances(self, instance_datas: list[EC2InstanceData]) -> None:
285+
async def terminate_instances(
286+
self, instance_datas: Iterable[EC2InstanceData]
287+
) -> None:
285288
try:
286289
with log_context(
287290
_logger,

packages/aws-library/src/aws_library/ec2/models.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,28 @@ class AWSTagValue(ConstrainedStr):
8484
@dataclass(frozen=True)
class EC2InstanceData:
    """Immutable description of a single EC2 instance.

    The class defines its own ``__hash__`` so that instances can be stored in
    sets and used as dictionary keys (presumably needed because ``tags`` is a
    mapping, which a field-based generated hash could not handle -- confirm
    against the ``EC2Tags`` definition).
    """

    launch_time: datetime.datetime
    id: str
    aws_private_dns: InstancePrivateDNSName
    aws_public_ip: str | None
    type: InstanceTypeType
    state: InstanceStateNameType
    resources: Resources
    tags: EC2Tags

    def __hash__(self) -> int:
        """Return a hash computed from every field of the instance."""
        # the tag items are sorted and frozen into a tuple so that the hash
        # does not depend on the insertion order of the tags
        hashable_tags = tuple(sorted(self.tags.items()))
        identity = (
            self.launch_time,
            self.id,
            self.aws_private_dns,
            self.aws_public_ip,
            self.type,
            self.state,
            self.resources,
            hashable_tags,
        )
        return hash(identity)
108+
95109

96110
@dataclass(frozen=True)
97111
class EC2InstanceConfig:

packages/aws-library/tests/test_ec2_models.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55

66
import pytest
7-
from aws_library.ec2.models import AWSTagKey, AWSTagValue, Resources
7+
from aws_library.ec2.models import AWSTagKey, AWSTagValue, EC2InstanceData, Resources
8+
from faker import Faker
89
from pydantic import ByteSize, ValidationError, parse_obj_as
910

1011

@@ -132,3 +133,40 @@ def test_aws_tag_key_invalid(ec2_tag_key: str):
132133

133134
# for a value it does not
134135
parse_obj_as(AWSTagValue, ec2_tag_key)
136+
137+
138+
def _random_ec2_instance_data(faker: Faker) -> EC2InstanceData:
    """Build an ``EC2InstanceData`` filled with random fake values."""
    return EC2InstanceData(
        launch_time=faker.date_time(),
        id=faker.pystr(),
        aws_private_dns=faker.pystr(),
        aws_public_ip=f"{faker.ipv4()}",
        type="g4dn.xlarge",
        state="running",
        resources=Resources(
            cpus=faker.pyfloat(min_value=0.1),
            ram=ByteSize(faker.pyint(min_value=123)),
        ),
        tags={AWSTagKey("mytagkey"): AWSTagValue("mytagvalue")},
    )


def test_ec2_instance_data_hashable(faker: Faker):
    """``EC2InstanceData`` can be put in sets and hashes by value."""
    first_instance = _random_ec2_instance_data(faker)
    second_instance = _random_ec2_instance_data(faker)

    # building the sets already fails with TypeError if instances are unhashable
    first_set_of_ec2s = {first_instance}
    second_set_of_ec2s = {second_instance}

    union_of_sets = first_set_of_ec2s.union(second_set_of_ec2s)
    assert next(iter(first_set_of_ec2s)) in union_of_sets
    assert next(iter(second_set_of_ec2s)) in union_of_sets

    # an equal instance built around a *different* tags mapping object must
    # hash identically, otherwise set membership/difference would not work
    twin_of_first = EC2InstanceData(
        first_instance.launch_time,
        first_instance.id,
        first_instance.aws_private_dns,
        first_instance.aws_public_ip,
        first_instance.type,
        first_instance.state,
        first_instance.resources,
        dict(first_instance.tags),
    )
    assert twin_of_first is not first_instance
    assert twin_of_first == first_instance
    assert hash(twin_of_first) == hash(first_instance)
    assert twin_of_first in union_of_sets

services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,8 @@ def check_valid_instance_names(
115115
) -> dict[str, EC2InstanceBootSpecific]:
116116
# NOTE: needed because of a flaw in BaseCustomSettings
117117
# issubclass raises TypeError if used on Aliases
118-
if all(parse_obj_as(InstanceTypeType, key) for key in value):
119-
return value
120-
121-
msg = "Invalid instance type name"
122-
raise ValueError(msg)
118+
parse_obj_as(list[InstanceTypeType], list(value))
119+
return value
123120

124121

125122
class PrimaryEC2InstancesSettings(BaseCustomSettings):
@@ -177,18 +174,23 @@ class PrimaryEC2InstancesSettings(BaseCustomSettings):
177174
..., description="Password for accessing prometheus data"
178175
)
179176

177+
PRIMARY_EC2_INSTANCES_MAX_START_TIME: datetime.timedelta = Field(
178+
default=datetime.timedelta(minutes=2),
179+
description="Usual time taken an EC2 instance with the given AMI takes to startup and be ready to receive jobs "
180+
"(default to seconds, or see https://pydantic-docs.helpmanual.io/usage/types/#datetime-types for string formating)."
181+
"NOTE: be careful that this time should always be a factor larger than the real time, as EC2 instances"
182+
"that take longer than this time will be terminated as sometimes it happens that EC2 machine fail on start.",
183+
)
184+
180185
@validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES")
@classmethod
def check_valid_instance_names(
    cls, value: dict[str, EC2InstanceBootSpecific]
) -> dict[str, EC2InstanceBootSpecific]:
    """Check that every key of ``value`` parses as an ``InstanceTypeType``.

    The mapping itself is returned untouched; an invalid key makes
    ``parse_obj_as`` raise.
    """
    # NOTE: needed because of a flaw in BaseCustomSettings
    # issubclass raises TypeError if used on Aliases
    instance_type_names = list(value)
    parse_obj_as(list[InstanceTypeType], instance_type_names)
    return value
192194

193195
@validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES")
194196
@classmethod

services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import datetime
21
import logging
2+
from collections.abc import Iterable
33

4+
import arrow
45
from aws_library.ec2.client import SimcoreEC2API
56
from aws_library.ec2.models import (
67
AWSTagKey,
@@ -96,15 +97,17 @@ async def create_cluster(
9697
return new_ec2_instance_data
9798

9899

99-
async def get_all_clusters(app: FastAPI) -> set[EC2InstanceData]:
    """Return the running EC2 instances matching the primary-instances filter.

    Instances are looked up by the configured primary key name and by the
    tags produced by ``all_created_ec2_instances_filter``; only instances in
    the ``running`` state are requested. The result is returned as a set.
    """
    settings = get_application_settings(app)
    primary_settings = settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES
    assert primary_settings  # nosec

    running_instances = await get_ec2_client(app).get_instances(
        key_names=[primary_settings.PRIMARY_EC2_INSTANCES_KEY_NAME],
        tags=all_created_ec2_instances_filter(settings),
        state_names=["running"],
    )
    return set(running_instances)
110113

@@ -159,9 +162,11 @@ async def set_instance_heartbeat(app: FastAPI, *, instance: EC2InstanceData) ->
159162
ec2_client = get_ec2_client(app)
160163
await ec2_client.set_instances_tags(
161164
[instance],
162-
tags={HEARTBEAT_TAG_KEY: f"{datetime.datetime.now(datetime.timezone.utc)}"},
165+
tags={HEARTBEAT_TAG_KEY: AWSTagValue(arrow.utcnow().datetime.isoformat())},
163166
)
164167

165168

166-
async def delete_clusters(
    app: FastAPI, *, instances: Iterable[EC2InstanceData]
) -> None:
    """Terminate the given EC2 instances through the application's EC2 client."""
    ec2_client = get_ec2_client(app)
    await ec2_client.terminate_instances(instances)
Lines changed: 108 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
import datetime
22
import logging
3+
from collections.abc import Iterable
4+
from typing import Final
35

46
import arrow
5-
from aws_library.ec2.models import EC2InstanceData
7+
from aws_library.ec2.models import AWSTagKey, EC2InstanceData
68
from fastapi import FastAPI
79
from models_library.users import UserID
810
from models_library.wallets import WalletID
11+
from pydantic import parse_obj_as
12+
from servicelib.logging_utils import log_catch
913

1014
from ..core.settings import get_application_settings
1115
from ..modules.clusters import (
@@ -21,16 +25,44 @@
2125
_logger = logging.getLogger(__name__)
2226

2327

24-
def _get_instance_last_heartbeat(instance: EC2InstanceData) -> datetime.datetime | None:
    """Return the instance's last heartbeat time, or ``None`` if it has none.

    The heartbeat is read from the ``HEARTBEAT_TAG_KEY`` tag and parsed with
    ``arrow``; a missing or empty tag yields ``None``.
    """
    raw_heartbeat = instance.tags.get(HEARTBEAT_TAG_KEY)
    if not raw_heartbeat:
        return None

    heartbeat_time: datetime.datetime = arrow.get(raw_heartbeat).datetime
    return heartbeat_time
36+
37+
38+
_USER_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "user_id")
39+
_WALLET_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "wallet_id")
40+
41+
42+
async def _get_all_associated_worker_instances(
    app: FastAPI,
    primary_instances: Iterable[EC2InstanceData],
) -> list[EC2InstanceData]:
    """Collect the worker instances of the clusters led by ``primary_instances``.

    For each primary instance the owning user and wallet are read from its
    tags and the matching workers are fetched with ``get_cluster_workers``.

    Returns:
        All workers found, in the order the primaries were iterated.
    """
    worker_instances: list[EC2InstanceData] = []
    for instance in primary_instances:
        # the asserts use the same tag-key constants as the lookups below
        assert _USER_ID_TAG_KEY in instance.tags  # nosec
        user_id = UserID(instance.tags[_USER_ID_TAG_KEY])

        assert _WALLET_ID_TAG_KEY in instance.tags  # nosec
        # NOTE: wallet_id can be None (stored in the tag as the string "None")
        wallet_tag = instance.tags[_WALLET_ID_TAG_KEY]
        wallet_id = WalletID(wallet_tag) if wallet_tag != "None" else None

        worker_instances.extend(
            await get_cluster_workers(app, user_id=user_id, wallet_id=wallet_id)
        )
    return worker_instances
3062

3163

3264
async def _find_terminateable_instances(
33-
app: FastAPI, instances: list[EC2InstanceData]
65+
app: FastAPI, instances: Iterable[EC2InstanceData]
3466
) -> list[EC2InstanceData]:
3567
app_settings = get_application_settings(app)
3668
assert app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES # nosec
@@ -42,61 +74,89 @@ async def _find_terminateable_instances(
4274
app_settings.CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
4375
* app_settings.SERVICE_TRACKING_HEARTBEAT
4476
)
77+
startup_delay = (
78+
app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_MAX_START_TIME
79+
)
4580
for instance in instances:
46-
last_heartbeat = _get_instance_last_heartbeat(instance)
47-
48-
elapsed_time_since_heartbeat = (
49-
datetime.datetime.now(datetime.timezone.utc) - last_heartbeat
50-
)
51-
_logger.info(
52-
"%s has still %ss before being terminateable",
53-
f"{instance.id=}",
54-
f"{(time_to_wait_before_termination - elapsed_time_since_heartbeat).total_seconds()}",
55-
)
56-
if elapsed_time_since_heartbeat >= time_to_wait_before_termination:
57-
# let's terminate that one
58-
terminateable_instances.append(instance)
81+
if last_heartbeat := _get_instance_last_heartbeat(instance):
82+
elapsed_time_since_heartbeat = arrow.utcnow().datetime - last_heartbeat
83+
allowed_time_to_wait = time_to_wait_before_termination
84+
if elapsed_time_since_heartbeat >= allowed_time_to_wait:
85+
terminateable_instances.append(instance)
86+
else:
87+
_logger.info(
88+
"%s has still %ss before being terminateable",
89+
f"{instance.id=}",
90+
f"{(allowed_time_to_wait - elapsed_time_since_heartbeat).total_seconds()}",
91+
)
92+
else:
93+
elapsed_time_since_startup = arrow.utcnow().datetime - instance.launch_time
94+
allowed_time_to_wait = startup_delay
95+
if elapsed_time_since_startup >= allowed_time_to_wait:
96+
terminateable_instances.append(instance)
5997

6098
# get all terminateable instances associated worker instances
61-
worker_instances = []
62-
for instance in terminateable_instances:
63-
assert "user_id" in instance.tags # nosec
64-
user_id = UserID(instance.tags["user_id"])
65-
assert "wallet_id" in instance.tags # nosec
66-
# NOTE: wallet_id can be None
67-
wallet_id = (
68-
WalletID(instance.tags["wallet_id"])
69-
if instance.tags["wallet_id"] != "None"
70-
else None
71-
)
72-
73-
worker_instances.extend(
74-
await get_cluster_workers(app, user_id=user_id, wallet_id=wallet_id)
75-
)
99+
worker_instances = await _get_all_associated_worker_instances(
100+
app, terminateable_instances
101+
)
76102

77103
return terminateable_instances + worker_instances
78104

79105

80106
async def check_clusters(app: FastAPI) -> None:
    """Refresh heartbeats and terminate idle, stuck-at-startup or broken clusters.

    Steps:
      1. fetch all primary instances and find those whose scheduler answers a ping
      2. set a new heartbeat on every connected instance whose scheduler is busy
      3. terminate the connected instances found terminateable
      4. among the disconnected instances, terminate those that never had a
         heartbeat (still starting) and those that had one (broken), once
         ``_find_terminateable_instances`` reports them
    """
    primary_instances = await get_all_clusters(app)

    connected_instances = {
        instance
        for instance in primary_instances
        if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app))
    }

    for instance in connected_instances:
        with log_catch(_logger, reraise=False):
            # NOTE: some connected instance could in theory break between these 2 calls, therefore this is silenced and will
            # be handled in the next call to check_clusters
            if await is_scheduler_busy(
                get_scheduler_url(instance), get_scheduler_auth(app)
            ):
                _logger.info(
                    "%s is running tasks",
                    f"{instance.id=} for {instance.tags=}",
                )
                await set_instance_heartbeat(app, instance=instance)

    # NOTE(review): the check below reads the tags fetched at the top of this
    # call; whether a heartbeat written just above is visible in them cannot be
    # told from here -- confirm a one-cycle-old heartbeat is acceptable
    if terminateable_instances := await _find_terminateable_instances(
        app, connected_instances
    ):
        await delete_clusters(app, instances=terminateable_instances)

    # analyse disconnected instances (currently starting or broken)
    disconnected_instances = primary_instances - connected_instances

    # starting instances do not have a heartbeat set but sometimes might fail and should be terminated
    starting_instances = {
        instance
        for instance in disconnected_instances
        if _get_instance_last_heartbeat(instance) is None
    }

    if terminateable_instances := await _find_terminateable_instances(
        app, starting_instances
    ):
        _logger.warning(
            "The following clusters' primary EC2 were starting for too long and will be terminated now "
            "(either because a cluster was started and is not needed anymore, or there is an issue): '%s'",
            f"{[i.id for i in terminateable_instances]}",
        )
        await delete_clusters(app, instances=terminateable_instances)

    # the other instances are broken (they were at some point connected but now not anymore)
    broken_instances = disconnected_instances - starting_instances
    if terminateable_instances := await _find_terminateable_instances(
        app, broken_instances
    ):
        _logger.error(
            "The following clusters' primary EC2 were found as unresponsive "
            "(TIP: there is something wrong here, please inform support) and will be terminated now: '%s'",
            f"{[i.id for i in terminateable_instances]}",
        )
        await delete_clusters(app, instances=terminateable_instances)

0 commit comments

Comments
 (0)