1+ #!/usr/bin/env python3
2+
"""
container_checker

This script is intended to be run by Monit. It writes an alerting message into
syslog if it finds containers which are expected to be running but are not.
The check implemented by main() covers only expected containers that are not
running; containers that are running unexpectedly are not reported. Note that if
a print(...) statement in this script is executed, the string in it will be
appended to the Monit syslog messages.

The following is an example from a Monit configuration file showing how Monit
will run this script:

    check program container_checker with path "/usr/bin/container_checker"
        if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""
18+
19+ import docker
20+ import sys
21+
22+ from sonic_py_common import multi_asic , device_info
23+ from swsscommon import swsscommon
24+
25+ EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
26+ EVENTS_PUBLISHER_TAG = "event-down-ctr"
27+
def check_docker_image(image_name):
    """
    @summary: This function will check if docker image exists.
    @param image_name: Name of the docker image to look up through the local
                       docker daemon socket.
    @return: True if the image exists, otherwise False. False is also returned
             when the docker API reports an error during the lookup.
    """
    client = None
    try:
        client = docker.DockerClient(base_url='unix://var/run/docker.sock')
        client.images.get(image_name)
        return True
    except (docker.errors.ImageNotFound, docker.errors.APIError):
        return False
    finally:
        # Release the HTTP session held by the client instead of leaving it
        # open until interpreter exit.
        if client is not None:
            client.close()
39+
def get_expected_running_containers():
    """
    @summary: This function will get the expected running & always-enabled containers by following the rule:
              The 'state' field of container in 'FEATURE' table should not be 'disabled' or 'always_disabled'.
              If the device has Multi-ASIC, this function will get container list by determining the
              value of field 'has_global_scope', the number of ASICs and the value of field
              'has_per_asic_scope'.
              If the device has single ASIC, the container name was put into the list.
              In addition, "database-chassis" is added to the always-enabled set on supervisor,
              disaggregated-chassis and smartswitch devices, and on smartswitch devices one
              "databasedpu<suffix>" container is added per key of the 'DPUS' table, where
              <suffix> is the key with 'dpu' removed.
    @return: A set which contains the expected running containers and a set that has
             containers marked as "always_enabled".
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    expected_running_containers = set()
    always_running_containers = set()

    # Get current asic presence list. For multi_asic system, multi instance containers
    # should be checked only for asics present.
    asics_id_presence = multi_asic.get_asic_presence_list()

    # Some services may run all the instances irrespective of asic presence.
    # Add those to exception list.
    # database service: Currently services have dependency on all database services to
    # be up irrespective of asic presence.
    # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
    # it will be removed from exception list.
    run_all_instance_list = ['database', 'bgp']

    container_list = []
    for container_name in feature_table.keys():
        # skip frr_bmp since it's not container just bmp option used by bgpd
        if container_name == "frr_bmp":
            continue
        # slim image does not have telemetry container and corresponding docker image
        if container_name == "telemetry" and not check_docker_image("docker-sonic-telemetry"):
            # If telemetry container image is not present, check gnmi container image
            # If gnmi container image is not present, ignore telemetry container check
            # if gnmi container image is present, check gnmi container instead of telemetry
            if check_docker_image("docker-sonic-gnmi"):
                container_list.append("gnmi")
            else:
                print("Ignoring telemetry container check on image which has no corresponding telemetry or gnmi docker image")
            continue
        container_list.append(container_name)

    # The platform type does not change while the script runs; query it once.
    is_multi_asic = multi_asic.is_multi_asic()

    for container_name in container_list:
        feature = feature_table.get(container_name)
        if feature is None:
            # "gnmi" may have been substituted for "telemetry" above without
            # having a FEATURE entry of its own; there is no state to evaluate,
            # so skip it instead of failing with a KeyError.
            continue

        state = feature["state"]
        if state in ("disabled", "always_disabled"):
            continue

        instances = _expand_container_instances(
            container_name, feature, is_multi_asic, asics_id_presence, run_all_instance_list)
        expected_running_containers |= instances
        # 'always_enabled' containers expand to the same instance names, so the
        # expansion computed above is reused.
        if state == 'always_enabled':
            always_running_containers |= instances

    if device_info.is_supervisor() or device_info.is_disaggregated_chassis() or device_info.is_smartswitch():
        always_running_containers.add("database-chassis")

    if device_info.is_smartswitch():
        for dpu_name in config_db.get_table("DPUS"):
            # No trailing whitespace in the name: it must match the real
            # container name exactly when looked up later.
            always_running_containers.add(f"databasedpu{dpu_name.replace('dpu', '')}")

    return expected_running_containers, always_running_containers


def _expand_container_instances(container_name, feature, is_multi_asic,
                                asics_id_presence, run_all_instance_list):
    """
    @summary: Expand one FEATURE entry into the container instance names it implies.
              On a single-ASIC device this is just the container name. On a Multi-ASIC
              device the global instance is included when 'has_global_scope' is "True"
              (the default), and one "<name><asic_id>" instance is included per ASIC when
              'has_per_asic_scope' is "True" (default "False"), limited to ASICs in
              asics_id_presence unless the container is in run_all_instance_list.
    @return: A set of container instance names.
    """
    if not is_multi_asic:
        return {container_name}

    instances = set()
    if feature.get("has_global_scope", "True") == "True":
        instances.add(container_name)
    if feature.get("has_per_asic_scope", "False") == "True":
        for asic_id in range(multi_asic.get_num_asics()):
            if asic_id in asics_id_presence or container_name in run_all_instance_list:
                instances.add(container_name + str(asic_id))
    return instances
124+
def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
              A FEATURE entry counts as running when it has a non-empty
              'container_id' field. Each name in always_running_containers
              is additionally looked up through the docker daemon and added
              when its state status is 'running'.
    @param always_running_containers: Iterable of container names to query
              directly from docker.
    @return: A set which contains the current running containers. The set is
             empty when the FEATURE table in STATE_DB has no keys; in that
             case docker is not queried by this function.
    """
    running_containers = set()

    state_db = swsscommon.DBConnector("STATE_DB", 0)
    tbl = swsscommon.Table(state_db, "FEATURE")
    # Fetch the key list once and reuse it for both the emptiness check and
    # the iteration.
    feature_names = tbl.getKeys()
    if not feature_names:
        return running_containers

    for name in feature_names:
        data = dict(tbl.get(name)[1])
        if data.get('container_id'):
            running_containers.add(name)

    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    try:
        for name in always_running_containers:
            try:
                container = client.containers.get(name)
            except (docker.errors.NotFound, docker.errors.APIError) as err:
                print("Failed to get container '{}'. Error: '{}'".format(name, err))
                continue
            container_state = container.attrs.get('State', {})
            if container_state.get('Status', "") == 'running':
                running_containers.add(name)
    finally:
        # Release the HTTP session held by the docker client.
        client.close()

    return running_containers
160+
161+
def get_current_running_from_dockers():
    """
    @summary: This function will get all running containers from
              the list of docker containers in running state.
              A container is reported under its "raw_container_name"
              label when that label is set, otherwise under its docker name.
    @return: A set which contains containers that are
             in running state. If the docker API reports an error, the
             error is printed and the names collected so far are returned.
    """
    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    running_containers = set()
    try:
        for ctr in client.containers.list(filters={"status": "running"}):
            labels = ctr.labels
            # Prefer raw_container_name label over actual name
            if labels and "raw_container_name" in labels:
                running_containers.add(labels["raw_container_name"])
            else:
                running_containers.add(ctr.name)
    except docker.errors.APIError as err:
        print(f"Failed to retrieve the running container list. Error: '{err}'")
    finally:
        # Release the HTTP session held by the docker client.
        client.close()

    return running_containers
184+
185+
def get_current_running_containers(always_running_containers):
    """
    @summary: This function will get the set of currently running containers
              by combining the containers reported by
              get_current_running_from_DB() with those reported by
              get_current_running_from_dockers().
    @param always_running_containers: Container names passed through to
              get_current_running_from_DB().
    @return: A set of currently running containers.
    """
    running = get_current_running_from_DB(always_running_containers)
    running |= get_current_running_from_dockers()
    return running
197+
198+
def publish_events(lst):
    """
    @summary: This function will publish one event, tagged with
              EVENTS_PUBLISHER_TAG, for every container name in lst. The
              container name is carried in the "ctr_name" parameter.
    @param lst: Iterable of container names to publish an event for.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        for ctr in lst:
            params["ctr_name"] = ctr
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if publishing fails.
        swsscommon.events_deinit_publisher(events_handle)
208+
209+
def main():
    """
    @summary: This function will compare the difference between the current running containers
              and the containers which were expected to run. If containers which were expected
              to run were not running, then an event is published for each of them, an alerting
              message is printed (which Monit writes into syslog) and the script exits with
              status 3.
    """
    expected_running_containers, always_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers(always_running_containers)

    expected_running_containers |= always_running_containers
    # Sort the names so the published events and the alert text have a stable
    # order from run to run instead of depending on set iteration order.
    not_running_containers = sorted(
        expected_running_containers.difference(current_running_containers))
    if not_running_containers:
        publish_events(not_running_containers)
        print("Expected containers not running: " + ", ".join(not_running_containers))
        sys.exit(3)
225+
226+
# Script entry point. main() exits with status 3 on its own when expected
# containers are missing, so reaching the last line means nothing was missing
# and the script reports success (status 0).
if __name__ == "__main__":
    main()
    raise SystemExit(0)
0 commit comments