Skip to content

Commit d94ea36

Browse files
committed
Add sonic-telemetry-sidecar container
1 parent c58f76c commit d94ea36

File tree

7 files changed

+794
-0
lines changed

7 files changed

+794
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %}
ARG BASE=docker-config-engine-bookworm-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}}

# Stage 1: apply all image changes on top of the base image.
FROM $BASE AS base

ARG docker_container_name
ARG image_version
RUN [ -f /etc/rsyslog.conf ] && sed -ri "s/%syslogtag%/$docker_container_name#%syslogtag%/;" /etc/rsyslog.conf

# Make apt-get non-interactive
ENV DEBIAN_FRONTEND=noninteractive

# Pass the image_version to container
ENV IMAGE_VERSION=$image_version

COPY ["systemd_stub.py", "/usr/bin/"]
COPY ["systemd_scripts/", "/usr/share/sonic/systemd_scripts/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]

RUN chmod +x /usr/bin/systemd_stub.py

# Stage 2: start again from the base image and copy the stage 1 filesystem
# changes over in a single layer.
FROM $BASE

RUN --mount=type=bind,from=base,target=/changes-to-image rsync -axAX --no-D --exclude=/sys --exclude=/proc --exclude=/dev --exclude=resolv.conf /changes-to-image/ /

# Make apt-get non-interactive
ENV DEBIAN_FRONTEND=noninteractive

# An ARG goes out of scope at the end of the stage that declared it, so it
# must be declared again here; otherwise $image_version expands to empty.
ARG image_version

# Pass the image_version to container
ENV IMAGE_VERSION=$image_version

ENTRYPOINT ["/usr/local/bin/supervisord"]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; Supervisor configuration for the telemetry sidecar container.

[supervisord]
logfile_maxbytes=1MB
logfile_backups=2
; Keep supervisord in the foreground.
nodaemon=true

; Event listener that receives PROCESS_STATE events. Presumably it is what
; starts the programs marked dependent_startup=true below, since they all
; have autostart=false -- confirm against supervisord_dependent_startup.
[eventlistener:dependent-startup]
command=python3 -m supervisord_dependent_startup
autostart=true
autorestart=unexpected
startretries=0
exitcodes=0,3
events=PROCESS_STATE
buffer_size=1024

; Syslog daemon, run in the foreground (-n) without a PID file (-iNONE).
[program:rsyslogd]
command=/usr/sbin/rsyslogd -n -iNONE
priority=1
autostart=false
autorestart=unexpected
stdout_logfile=NONE
stdout_syslog=true
stderr_logfile=NONE
stderr_syslog=true
dependent_startup=true

; Not restarted on exit (autorestart=false); declared to wait for rsyslogd
; to reach the running state.
[program:systemd_stub]
command=python3 /usr/bin/systemd_stub.py
priority=3
autostart=false
autorestart=false
startsecs=0
stdout_logfile=NONE
stdout_syslog=true
stderr_logfile=NONE
stderr_syslog=true
dependent_startup=true
dependent_startup_wait_for=rsyslogd:running
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
container_checker
5+
6+
This script is intended to be run by Monit. It will write an alerting message into
7+
syslog if it finds containers which are expected to run but are not running. At
8+
the same time, if some containers are running unexpectedly, it also writes an alerting
9+
syslog message. Note that if a print(...) statement in this script is executed, the
10+
string in it will be appended to Monit syslog messages.
11+
12+
The following is an example in Monit configuration file to show how Monit will run
13+
this script:
14+
15+
check program container_checker with path "/usr/bin/container_checker"
16+
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
17+
"""
18+
19+
import docker
20+
import sys
21+
22+
from sonic_py_common import multi_asic, device_info
23+
from swsscommon import swsscommon
24+
25+
# Source name handed to swsscommon.events_init_publisher() in publish_events().
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag used for every event published by publish_events().
EVENTS_PUBLISHER_TAG = "event-down-ctr"
27+
28+
def check_docker_image(image_name):
    """
    @summary: This function will check if docker image exists.
    @param image_name: Name of the image to look up through the Docker
                       daemon socket at /var/run/docker.sock.
    @return: True if the image exists, otherwise False. A Docker API error
             raised during the lookup is also reported as False.
    """
    docker_client = None
    try:
        docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
        docker_client.images.get(image_name)
        return True
    except (docker.errors.ImageNotFound, docker.errors.APIError):
        return False
    finally:
        # Always release the client's HTTP session, on success and on failure.
        if docker_client is not None:
            docker_client.close()
39+
40+
def _expand_container_instances(container_name, feature_entry, is_multi_asic,
                                asics_id_presence, run_all_instance_list):
    """
    @summary: Expand one FEATURE table entry into the container names it implies.
    @param container_name: Name of the feature/container.
    @param feature_entry: The FEATURE table row of this container.
    @param is_multi_asic: Result of multi_asic.is_multi_asic().
    @param asics_id_presence: ASIC ids reported as present.
    @param run_all_instance_list: Containers that run one instance per ASIC
                                  regardless of ASIC presence.
    @return: A set of container names. On a single-ASIC device it is just
             the container name itself.
    """
    if not is_multi_asic:
        return {container_name}

    instances = set()
    if feature_entry.get("has_global_scope", "True") == "True":
        instances.add(container_name)
    if feature_entry.get("has_per_asic_scope", "False") == "True":
        for asic_id in range(multi_asic.get_num_asics()):
            if asic_id in asics_id_presence or container_name in run_all_instance_list:
                instances.add(container_name + str(asic_id))
    return instances


def get_expected_running_containers():
    """
    @summary: This function will get the expected running & always-enabled containers by following the rule:
              The 'state' field of container in 'FEATURE' table should not be 'disabled' or 'always_disabled'.
              If the device has Multi-ASIC, this function will get container list by determining the
              value of field 'has_global_scope', the number of ASICs and the value of field
              'has_per_asic_scope'.
              If the device has single ASIC, the container name was put into the list.
    @return: A set which contains the expected running containers and a set that has
             containers marked as "always_enabled".
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    expected_running_containers = set()
    always_running_containers = set()

    # Get current asic presence list. For multi_asic system, multi instance containers
    # should be checked only for asics present.
    asics_id_presence = multi_asic.get_asic_presence_list()

    # Some services may run all the instances irrespective of asic presence.
    # Add those to exception list.
    # database service: Currently services have dependency on all database services to
    # be up irrespective of asic presence.
    # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
    # it will be removed from exception list.
    run_all_instance_list = ['database', 'bgp']

    container_list = []
    for container_name in feature_table:
        # skip frr_bmp since it's not container just bmp option used by bgpd
        if container_name == "frr_bmp":
            continue
        # slim image does not have telemetry container and corresponding docker image
        if container_name == "telemetry" and not check_docker_image("docker-sonic-telemetry"):
            # If telemetry container image is not present, check gnmi container image
            # If gnmi container image is not present, ignore telemetry container check
            # if gnmi container image is present, check gnmi container instead of telemetry
            if check_docker_image("docker-sonic-gnmi"):
                # NOTE(review): "gnmi" is looked up in feature_table below, which
                # raises KeyError if FEATURE has no "gnmi" entry -- confirm such
                # images always carry that entry.
                container_list.append("gnmi")
            else:
                print("Ignoring telemetry container check on image which has no corresponding telemetry or gnmi docker image")
            continue
        container_list.append(container_name)

    # Loop-invariant, so query it once instead of once per container.
    is_multi_asic = multi_asic.is_multi_asic()

    for container_name in container_list:
        feature_entry = feature_table[container_name]
        state = feature_entry["state"]
        if state in ("disabled", "always_disabled"):
            continue

        instances = _expand_container_instances(
            container_name, feature_entry, is_multi_asic,
            asics_id_presence, run_all_instance_list)
        expected_running_containers |= instances
        if state == 'always_enabled':
            always_running_containers |= instances

    if device_info.is_supervisor() or device_info.is_disaggregated_chassis() or device_info.is_smartswitch():
        always_running_containers.add("database-chassis")

    if device_info.is_smartswitch():
        raw_dpustable = config_db.get_table("DPUS")
        for dpu_name in raw_dpustable:
            container_name = f"databasedpu{dpu_name.replace('dpu', '')}"
            always_running_containers.add(container_name)

    return expected_running_containers, always_running_containers
124+
125+
def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
    @param always_running_containers: Container names that are additionally
              looked up one by one through the Docker API; those whose
              State.Status is 'running' are added to the result.
    @return: A set which contains the current running containers. The set
             is empty, and Docker is not queried, when the FEATURE table
             in STATE_DB has no keys.
    """
    running_containers = set()

    state_db = swsscommon.DBConnector("STATE_DB", 0)
    tbl = swsscommon.Table(state_db, "FEATURE")
    feature_keys = tbl.getKeys()
    if not feature_keys:
        return running_containers

    for name in feature_keys:
        # tbl.get() returns a pair; the second item holds the field/value data.
        data = dict(tbl.get(name)[1])
        if data.get('container_id'):
            running_containers.add(name)

    running_status = 'running'
    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    try:
        for name in always_running_containers:
            try:
                container = docker_client.containers.get(name)
                container_state = container.attrs.get('State', {})
                if container_state.get('Status', "") == running_status:
                    running_containers.add(name)
            except (docker.errors.NotFound, docker.errors.APIError) as err:
                # Best effort: report the failure and move on to the next container.
                print("Failed to get container '{}'. Error: '{}'".format(name, err))
    finally:
        # Release the client's HTTP session even if an unexpected error escapes.
        docker_client.close()

    return running_containers
160+
161+
162+
def get_current_running_from_dockers():
    """
    @summary: This function will get all running containers from
              the list of docker containers in running state.
    @return: A set which contains containers that are
             in running state. If the Docker API call fails, the error is
             printed and the names collected so far are returned.
    """
    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    running_containers = set()
    try:
        for ctr in docker_client.containers.list(filters={"status": "running"}):
            labels = ctr.labels
            # Prefer raw_container_name label over actual name
            if labels and "raw_container_name" in labels:
                running_containers.add(labels["raw_container_name"])
            else:
                running_containers.add(ctr.name)
    except docker.errors.APIError as err:
        print(f"Failed to retrieve the running container list. Error: '{err}'")
    finally:
        # Release the client's HTTP session whether or not the listing succeeded.
        docker_client.close()

    return running_containers
184+
185+
186+
def get_current_running_containers(always_running_containers):
    """
    @summary: Collect the currently running containers from both sources:
              the FEATURE table in STATE_DB and the docker container list.
    @param always_running_containers: Passed through to
              get_current_running_from_DB().
    @return: A set holding the union of the names reported by both sources.
    """
    running_from_db = get_current_running_from_DB(always_running_containers)
    running_from_dockers = get_current_running_from_dockers()

    # Merge the docker view into the set obtained from the DB.
    running_from_db.update(running_from_dockers)
    return running_from_db
197+
198+
199+
def publish_events(lst):
    """
    @summary: Publish one EVENTS_PUBLISHER_TAG event for every container name.
    @param lst: Iterable of container names; each is sent as the
                "ctr_name" parameter of its own event.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        for ctr in lst:
            params["ctr_name"] = ctr
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher handle even if publishing an event fails.
        swsscommon.events_deinit_publisher(events_handle)
208+
209+
210+
def main():
    """
    @summary: This function will compare the difference between the current running containers
              and the containers which were expected to run. If containers which were expected
              to run were not running, then an event is published for each of them, an
              alerting message is printed and the script exits with status 3.
    """
    expected_running_containers, always_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers(always_running_containers)

    expected_running_containers |= always_running_containers
    # Sort so that the events and the alert text come out in the same order on
    # every run; iterating the raw set gives no ordering guarantee.
    not_running_containers = sorted(expected_running_containers - current_running_containers)
    if not_running_containers:
        publish_events(not_running_containers)
        print("Expected containers not running: " + ", ".join(not_running_containers))
        sys.exit(3)
225+
226+
227+
if __name__ == "__main__":
    # main() exits with status 3 itself when containers are missing, so
    # getting past it means the check passed.
    main()
    raise SystemExit(0)

0 commit comments

Comments
 (0)