1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ container_checker
5
+
6
+ This script is intended to be run by Monit. It will write an alerting message into
7
+ syslog if it found containers which were expected to run but were not running. At
8
+ the same time, if some containers were unexpected to run, it also writes an alerting
9
+ syslog message. Note that if print(...) statement in this script was executed, the
10
+ string in it will be appended to Monit syslog messages.
11
+
12
+ The following is an example in Monit configuration file to show how Monit will run
13
+ this script:
14
+
15
+ check program container_checker with path "/usr/bin/container_checker"
16
+ if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
17
+ """
18
+
19
+ import docker
20
+ import sys
21
+
22
+ from sonic_py_common import multi_asic , device_info
23
+ from swsscommon import swsscommon
24
+
25
# Source name handed to swsscommon.events_init_publisher() in publish_events().
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag passed to swsscommon.event_publish() for each container found not running.
EVENTS_PUBLISHER_TAG = "event-down-ctr"
27
+
28
def check_docker_image(image_name):
    """
    @summary: This function will check if docker image exists.
    @param image_name: Name of the docker image to look up through the local
                       docker socket.
    @return: True if the image exists, otherwise False. False is also returned
             when the docker API reports an error during the lookup.
    """
    docker_client = None
    try:
        docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
        docker_client.images.get(image_name)
        return True
    except (docker.errors.ImageNotFound, docker.errors.APIError):
        return False
    finally:
        # Always release the client's connections, whichever path was taken.
        # The client is None only if its construction itself raised.
        if docker_client is not None:
            docker_client.close()
39
+
40
def _get_container_instances(container_name, feature_entry, is_multi_asic,
                             asics_id_presence, run_all_instance_list):
    """
    @summary: Expand one FEATURE entry into the container instance names it
              stands for.
    @param container_name: Key of the entry in the 'FEATURE' table.
    @param feature_entry: Field/value mapping of that entry.
    @param is_multi_asic: True if the device is a Multi-ASIC device.
    @param asics_id_presence: Collection of ASIC ids currently present.
    @param run_all_instance_list: Container names whose per-ASIC instances are
                                  expected regardless of ASIC presence.
    @return: A set of container instance names.
    """
    if not is_multi_asic:
        return {container_name}

    instances = set()
    # A missing 'has_global_scope' field is treated as "True".
    if feature_entry.get("has_global_scope", "True") == "True":
        instances.add(container_name)
    # A missing 'has_per_asic_scope' field is treated as "False".
    if feature_entry.get("has_per_asic_scope", "False") == "True":
        runs_on_every_asic = container_name in run_all_instance_list
        for asic_id in range(multi_asic.get_num_asics()):
            if runs_on_every_asic or asic_id in asics_id_presence:
                instances.add(container_name + str(asic_id))
    return instances


def get_expected_running_containers():
    """
    @summary: This function will get the expected running & always-enabled containers by following the rule:
              The 'state' field of container in 'FEATURE' table should not be 'disabled' or 'always_disabled'.
              If the device has Multi-ASIC, this function will get container list by determining the
              value of field 'has_global_scope', the number of ASICs and the value of field
              'has_per_asic_scope'.
              If the device has single ASIC, the container name was put into the list.
    @return: A set which contains the expected running containers and a set that has
             containers marked as "always_enabled". The second set additionally holds
             "database-chassis" on supervisor / disaggregated-chassis / smartswitch
             devices and one "databasedpu<N>" name per 'DPUS' entry on a smartswitch.
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    expected_running_containers = set()
    always_running_containers = set()

    # Get current asic presence list. For multi_asic system, multi instance containers
    # should be checked only for asics present.
    asics_id_presence = multi_asic.get_asic_presence_list()

    # Some services may run all the instances irrespective of asic presence.
    # Add those to exception list.
    # database service: Currently services have dependency on all database services to
    # be up irrespective of asic presence.
    # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
    # it will be removed from exception list.
    run_all_instance_list = ['database', 'bgp']

    container_list = []
    for container_name in feature_table.keys():
        # skip frr_bmp since it's not container just bmp option used by bgpd
        if container_name == "frr_bmp":
            continue
        # slim image does not have telemetry container and corresponding docker image
        if container_name == "telemetry" and not check_docker_image("docker-sonic-telemetry"):
            # If telemetry container image is not present, check gnmi container image
            # If gnmi container image is not present, ignore telemetry container check
            # if gnmi container image is present, check gnmi container instead of telemetry
            if check_docker_image("docker-sonic-gnmi"):
                # NOTE(review): this assumes 'gnmi' has its own FEATURE entry;
                # the lookup below raises KeyError otherwise - confirm.
                container_list.append("gnmi")
            else:
                print("Ignoring telemetry container check on image which has no corresponding telemetry or gnmi docker image")
            continue
        container_list.append(container_name)

    # The answer does not change between containers, so ask only once.
    is_multi_asic = multi_asic.is_multi_asic()

    for container_name in container_list:
        feature_entry = feature_table[container_name]
        state = feature_entry["state"]
        if state in ("disabled", "always_disabled"):
            continue

        instances = _get_container_instances(container_name, feature_entry, is_multi_asic,
                                             asics_id_presence, run_all_instance_list)
        expected_running_containers |= instances
        # 'always_enabled' containers are expected to run and are tracked separately too.
        if state == 'always_enabled':
            always_running_containers |= instances

    if device_info.is_supervisor() or device_info.is_disaggregated_chassis() or device_info.is_smartswitch():
        always_running_containers.add("database-chassis")

    if device_info.is_smartswitch():
        raw_dpustable = config_db.get_table("DPUS")
        for dpu_name in raw_dpustable:
            # Removing the 'dpu' substring turns e.g. 'dpu0' into 'databasedpu0'.
            dpu_database_container = f"databasedpu{dpu_name.replace('dpu', '')}"
            always_running_containers.add(dpu_database_container)

    return expected_running_containers, always_running_containers
124
+
125
def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
              Containers from always_running_containers are not looked up in
              that table; they are queried from docker by name instead.
    @param always_running_containers: Iterable of container names to check
              directly against docker.
    @return: A set which contains the current running containers. The set is
             empty when the FEATURE table @ STATE_DB has no keys; in that case
             docker is not queried at all.
    """
    running_containers = set()

    state_db = swsscommon.DBConnector("STATE_DB", 0)
    tbl = swsscommon.Table(state_db, "FEATURE")
    # Fetch the key list once and reuse it for both the emptiness check and the loop.
    feature_names = tbl.getKeys()
    if not feature_names:
        return running_containers

    for name in feature_names:
        # presumably tbl.get() returns (found, field/value pairs) - confirm
        data = dict(tbl.get(name)[1])
        # A non-empty 'container_id' field marks the feature's container as running.
        if data.get('container_id'):
            running_containers.add(name)

    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    try:
        for name in always_running_containers:
            try:
                container = docker_client.containers.get(name)
            except (docker.errors.NotFound, docker.errors.APIError) as err:
                # Report and move on; a container that cannot be fetched is
                # simply not added to the running set.
                print("Failed to get container '{}'. Error: '{}'".format(name, err))
                continue
            container_state = container.attrs.get('State', {})
            if container_state.get('Status', "") == 'running':
                running_containers.add(name)
    finally:
        # Release the client's connections even if an unexpected error escapes.
        docker_client.close()

    return running_containers
160
+
161
+
162
def get_current_running_from_dockers():
    """
    @summary: This function will get all running containers from
              the list of docker containers in running state.
    @return: A set which contains containers that are
             in running state. If docker reports an API error, the error is
             printed and whatever was collected so far is returned.
    """
    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    running_containers = set()
    try:
        for ctr in docker_client.containers.list(filters={"status": "running"}):
            labels = ctr.labels
            # Prefer raw_container_name label over actual name
            if labels and "raw_container_name" in labels:
                running_containers.add(labels["raw_container_name"])
            else:
                running_containers.add(ctr.name)
    except docker.errors.APIError as err:
        print(f"Failed to retrieve the running container list. Error: '{err}'")
    finally:
        # Release the client's connections on every path.
        docker_client.close()

    return running_containers
184
+
185
+
186
def get_current_running_containers(always_running_containers):
    """
    @summary: Collect the names of the containers that are running right now.
              The names obtained through get_current_running_from_DB() are
              merged with those reported by get_current_running_from_dockers().

    @return: A set of currently running containers.
    """
    running_now = get_current_running_from_DB(always_running_containers)
    running_now |= get_current_running_from_dockers()
    return running_now
197
+
198
+
199
def publish_events(lst):
    """
    @summary: Publish one event tagged EVENTS_PUBLISHER_TAG for every container
              name in lst, using a publisher initialised with
              EVENTS_PUBLISHER_SOURCE.
    @param lst: Iterable of container names; each one is sent as the
                "ctr_name" parameter of its own event.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        for ctr in lst:
            # The same map is reused; only "ctr_name" changes between events.
            params["ctr_name"] = ctr
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher handle even if publishing raised.
        swsscommon.events_deinit_publisher(events_handle)
208
+
209
+
210
def main():
    """
    @summary: This function will compare the difference between the current running containers
              and the containers which were expected to run. If containers which were expected
              to run were not running, then an event is published for each of them, an alerting
              message is printed and the script exits with status 3.
    """
    expected_running_containers, always_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers(always_running_containers)

    expected_running_containers |= always_running_containers
    # Sort the names so the printed message is identical from one run to the
    # next; iterating a set gives no stable order.
    not_running_containers = sorted(expected_running_containers.difference(current_running_containers))
    if not_running_containers:
        publish_events(not_running_containers)
        print("Expected containers not running: " + ", ".join(not_running_containers))
        sys.exit(3)
225
+
226
+
227
if __name__ == "__main__":
    # main() terminates the process with status 3 itself when an expected
    # container is missing, so getting past it means nothing was missing.
    main()
    raise SystemExit(0)
0 commit comments