1
1
import datetime
2
2
import logging
3
+ from collections .abc import Iterable
4
+ from typing import Final
3
5
4
6
import arrow
5
- from aws_library .ec2 .models import EC2InstanceData
7
+ from aws_library .ec2 .models import AWSTagKey , EC2InstanceData
6
8
from fastapi import FastAPI
7
9
from models_library .users import UserID
8
10
from models_library .wallets import WalletID
11
+ from pydantic import parse_obj_as
12
+ from servicelib .logging_utils import log_catch
9
13
10
14
from ..core .settings import get_application_settings
11
15
from ..modules .clusters import (
21
25
_logger = logging .getLogger (__name__ )
22
26
23
27
24
def _get_instance_last_heartbeat(instance: EC2InstanceData) -> datetime.datetime | None:
    """Return the timestamp stored in the instance's heartbeat tag, or None.

    A missing (or empty) heartbeat tag yields None — callers use this to tell
    "never heartbeated" (e.g. still starting) apart from "heartbeat too old".
    """
    raw_heartbeat = instance.tags.get(HEARTBEAT_TAG_KEY)
    # truthiness on purpose: an absent or empty tag value both count as "no heartbeat"
    if not raw_heartbeat:
        return None
    heartbeat_at: datetime.datetime = arrow.get(raw_heartbeat).datetime
    return heartbeat_at
37
+
38
# EC2 tag keys set on a cluster's primary instance; they record which
# user/wallet the cluster belongs to so the associated workers can be looked up.
_USER_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "user_id")
_WALLET_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "wallet_id")
40
+
41
+
42
async def _get_all_associated_worker_instances(
    app: FastAPI,
    primary_instances: Iterable[EC2InstanceData],
) -> list[EC2InstanceData]:
    """Collect the worker EC2 instances of all the given primary instances.

    The owning user/wallet of each primary is recovered from its EC2 tags and
    used to query the corresponding cluster workers.
    """
    worker_instances: list[EC2InstanceData] = []
    for instance in primary_instances:
        # use the tag-key constants here too, instead of re-spelling the
        # literals "user_id"/"wallet_id" (keeps the asserts and the lookups
        # guaranteed to check the same keys)
        assert _USER_ID_TAG_KEY in instance.tags  # nosec
        user_id = UserID(instance.tags[_USER_ID_TAG_KEY])
        assert _WALLET_ID_TAG_KEY in instance.tags  # nosec
        # NOTE: wallet_id can be None (serialized as the string "None" in the tag)
        wallet_id = (
            WalletID(instance.tags[_WALLET_ID_TAG_KEY])
            if instance.tags[_WALLET_ID_TAG_KEY] != "None"
            else None
        )

        worker_instances.extend(
            await get_cluster_workers(app, user_id=user_id, wallet_id=wallet_id)
        )
    return worker_instances
30
62
31
63
32
64
async def _find_terminateable_instances (
33
- app : FastAPI , instances : list [EC2InstanceData ]
65
+ app : FastAPI , instances : Iterable [EC2InstanceData ]
34
66
) -> list [EC2InstanceData ]:
35
67
app_settings = get_application_settings (app )
36
68
assert app_settings .CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES # nosec
@@ -42,61 +74,89 @@ async def _find_terminateable_instances(
42
74
app_settings .CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
43
75
* app_settings .SERVICE_TRACKING_HEARTBEAT
44
76
)
77
+ startup_delay = (
78
+ app_settings .CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES .PRIMARY_EC2_INSTANCES_MAX_START_TIME
79
+ )
45
80
for instance in instances :
46
- last_heartbeat = _get_instance_last_heartbeat (instance )
47
-
48
- elapsed_time_since_heartbeat = (
49
- datetime .datetime .now (datetime .timezone .utc ) - last_heartbeat
50
- )
51
- _logger .info (
52
- "%s has still %ss before being terminateable" ,
53
- f"{ instance .id = } " ,
54
- f"{ (time_to_wait_before_termination - elapsed_time_since_heartbeat ).total_seconds ()} " ,
55
- )
56
- if elapsed_time_since_heartbeat >= time_to_wait_before_termination :
57
- # let's terminate that one
58
- terminateable_instances .append (instance )
81
+ if last_heartbeat := _get_instance_last_heartbeat (instance ):
82
+ elapsed_time_since_heartbeat = arrow .utcnow ().datetime - last_heartbeat
83
+ allowed_time_to_wait = time_to_wait_before_termination
84
+ if elapsed_time_since_heartbeat >= allowed_time_to_wait :
85
+ terminateable_instances .append (instance )
86
+ else :
87
+ _logger .info (
88
+ "%s has still %ss before being terminateable" ,
89
+ f"{ instance .id = } " ,
90
+ f"{ (allowed_time_to_wait - elapsed_time_since_heartbeat ).total_seconds ()} " ,
91
+ )
92
+ else :
93
+ elapsed_time_since_startup = arrow .utcnow ().datetime - instance .launch_time
94
+ allowed_time_to_wait = startup_delay
95
+ if elapsed_time_since_startup >= allowed_time_to_wait :
96
+ terminateable_instances .append (instance )
59
97
60
98
# get all terminateable instances associated worker instances
61
- worker_instances = []
62
- for instance in terminateable_instances :
63
- assert "user_id" in instance .tags # nosec
64
- user_id = UserID (instance .tags ["user_id" ])
65
- assert "wallet_id" in instance .tags # nosec
66
- # NOTE: wallet_id can be None
67
- wallet_id = (
68
- WalletID (instance .tags ["wallet_id" ])
69
- if instance .tags ["wallet_id" ] != "None"
70
- else None
71
- )
72
-
73
- worker_instances .extend (
74
- await get_cluster_workers (app , user_id = user_id , wallet_id = wallet_id )
75
- )
99
+ worker_instances = await _get_all_associated_worker_instances (
100
+ app , terminateable_instances
101
+ )
76
102
77
103
return terminateable_instances + worker_instances
78
104
79
105
80
106
async def check_clusters(app: FastAPI) -> None:
    """Periodic housekeeping over all primary cluster instances.

    - refreshes the heartbeat of connected schedulers that are running tasks
    - terminates connected clusters whose heartbeat is too old
    - terminates disconnected clusters that never heartbeated and took too
      long to start
    - terminates disconnected clusters that were once connected (broken)
    """
    # NOTE(review): assumes get_all_clusters returns a set (set difference is
    # used below) — confirm against the helper's signature
    primary_instances = await get_all_clusters(app)

    # fixed misspelled local name (was `connected_intances`)
    connected_instances = {
        instance
        for instance in primary_instances
        if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app))
    }

    for instance in connected_instances:
        with log_catch(_logger, reraise=False):
            # NOTE: some connected instance could in theory break between these 2 calls, therefore this is silenced and will
            # be handled in the next call to check_clusters
            if await is_scheduler_busy(
                get_scheduler_url(instance), get_scheduler_auth(app)
            ):
                _logger.info(
                    "%s is running tasks",
                    f"{instance.id=} for {instance.tags=}",
                )
                await set_instance_heartbeat(app, instance=instance)

    if terminateable_instances := await _find_terminateable_instances(
        app, connected_instances
    ):
        await delete_clusters(app, instances=terminateable_instances)

    # analyse disconnected instances (currently starting or broken)
    disconnected_instances = primary_instances - connected_instances

    # starting instances do not have a heartbeat set but sometimes might fail and should be terminated
    starting_instances = {
        instance
        for instance in disconnected_instances
        if _get_instance_last_heartbeat(instance) is None
    }

    if terminateable_instances := await _find_terminateable_instances(
        app, starting_instances
    ):
        _logger.warning(
            "The following clusters'primary EC2 were starting for too long and will be terminated now "
            "(either because a cluster was started and is not needed anymore, or there is an issue): '%s",
            f"{[i.id for i in terminateable_instances]}",
        )
        await delete_clusters(app, instances=terminateable_instances)

    # the other instances are broken (they were at some point connected but now not anymore)
    broken_instances = disconnected_instances - starting_instances
    if terminateable_instances := await _find_terminateable_instances(
        app, broken_instances
    ):
        _logger.error(
            "The following clusters'primary EC2 were found as unresponsive "
            "(TIP: there is something wrong here, please inform support) and will be terminated now: '%s",
            f"{[i.id for i in terminateable_instances]}",
        )
        await delete_clusters(app, instances=terminateable_instances)
0 commit comments