diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index eafea6a4ec1d..fd5a9735bfaf 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -306,6 +306,9 @@ sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/* sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/ sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker +sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/ +sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker + # Install custom-built openssh sshd sudo dpkg --root=$FILESYSTEM_ROOT -i $debs_path/openssh-server_*.deb diff --git a/files/image_config/monit/conf.d/sonic-host b/files/image_config/monit/conf.d/sonic-host index 202c49f8d7b2..17d7c64af7e2 100644 --- a/files/image_config/monit/conf.d/sonic-host +++ b/files/image_config/monit/conf.d/sonic-host @@ -31,3 +31,5 @@ check program routeCheck with path "/usr/local/bin/route_check.py" every 5 cycles if status != 0 for 3 cycle then alert repeat every 1 cycles +check program container_checker with path "/usr/bin/container_checker" + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker new file mode 100755 index 000000000000..abfbb34430bb --- /dev/null +++ b/files/image_config/monit/container_checker @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 + +""" +container_checker + +This script is intended to be run by Monit. It will write an alerting message into +syslog if it found containers which were expected to run but were not running. At +the same time, if some containers were unexpected to run, it also writes an alerting +syslog message. Note that if print(...) statement in this script was executed, the +string in it will be appended to Monit syslog messages. + +The following is an example in Monit configuration file to show how Monit will run +this script: + +check program container_checker with path "/usr/bin/container_checker" + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles +""" + +import subprocess +import sys + +import swsssdk +from sonic_py_common import multi_asic + + +def get_command_result(command): + """ + @summary: This function will execute the command and return the resulting output. + @return: A string which contains the output of command. + """ + command_stdout = "" + + try: + proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + shell=True, universal_newlines=True) + command_stdout, command_stderr = proc_instance.communicate() + if proc_instance.returncode != 0: + print("Failed to execute the command '{}'. Return code: '{}'".format( + command, proc_instance.returncode)) + sys.exit(1) + except (OSError, ValueError) as err: + print("Failed to execute the command '{}'. Error: '{}'".format(command, err)) + sys.exit(2) + + return command_stdout.rstrip().split("\n") + + +def get_expected_running_containers(): + """ + @summary: This function will get the expected running containers by following the rule: + The 'state' field of container in 'FEATURE' table should not be 'disabled'. Then + if the device has Multi-ASIC, this function will get container list by determining the + value of field 'has_global_scope', the number of ASICs and the value of field + 'has_per_asic_scope'. If the device has single ASIC, the container name was put into + the list. + @return: A set which contains the expected running containers. + """ + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + + expected_running_containers = set() + + for container_name in feature_table.keys(): + if feature_table[container_name]["state"] != "disabled": + if multi_asic.is_multi_asic(): + if feature_table[container_name]["has_global_scope"] == "True": + expected_running_containers.add(container_name) + if feature_table[container_name]["has_per_asic_scope"] == "True": + num_asics = multi_asic.get_num_asics() + for asic_id in range(num_asics): + expected_running_containers.add(container_name + str(asic_id)) + else: + expected_running_containers.add(container_name) + + return expected_running_containers + + +def get_current_running_containers(): + """ + @summary: This function will get the current running container list by analyzing the + output of command `docker ps`. + @return: A set which contains the current running contianers. + """ + running_containers = set() + + command = "docker ps" + command_stdout = get_command_result(command) + for line in command_stdout[1:]: + running_containers.add(line.split()[-1].strip()) + + return running_containers + + +def main(): + """ + @summary: This function will compare the difference between the current running containers + and the containers which were expected to run. If containers which were exepcted + to run were not running, then an alerting message will be written into syslog. + """ + expected_running_containers = get_expected_running_containers() + current_running_containers = get_current_running_containers() + + not_running_containers = expected_running_containers.difference(current_running_containers) + if not_running_containers: + print("Expected containers not running: " + ", ".join(not_running_containers)) + sys.exit(3) + + unexpected_running_containers = current_running_containers.difference(expected_running_containers) + if unexpected_running_containers: + print("Unexpected running containers: " + ", ".join(unexpected_running_containers)) + sys.exit(4) + + +if __name__ == "__main__": + main()