From 46cb307392df29522968f50ba3cb3877ac0b44fb Mon Sep 17 00:00:00 2001
From: liuh-80
Date: Mon, 22 May 2023 03:14:54 +0000
Subject: [PATCH 1/3] Merge watchdog code to proc exit listener

---
 dockers/docker-orchagent/docker-init.j2      |   1 +
 dockers/docker-orchagent/supervisord.conf.j2 |   1 +
 .../docker-orchagent/watchdog_processes.j2   |   1 +
 files/scripts/supervisor-proc-exit-listener  | 152 +++++++++++-------
 4 files changed, 96 insertions(+), 59 deletions(-)
 create mode 100644 dockers/docker-orchagent/watchdog_processes.j2

diff --git a/dockers/docker-orchagent/docker-init.j2 b/dockers/docker-orchagent/docker-init.j2
index 46db99a3166f..bea9befc0b6f 100755
--- a/dockers/docker-orchagent/docker-init.j2
+++ b/dockers/docker-orchagent/docker-init.j2
@@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
     -t /usr/share/sonic/templates/vlan_vars.j2 \
     -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
     -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
+    -t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
     -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
 "
diff --git a/dockers/docker-orchagent/supervisord.conf.j2 b/dockers/docker-orchagent/supervisord.conf.j2
index eef040ee1784..498c8a48eb45 100644
--- a/dockers/docker-orchagent/supervisord.conf.j2
+++ b/dockers/docker-orchagent/supervisord.conf.j2
@@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
 priority=4
 autostart=false
 autorestart=false
+stdout_capture_maxbytes=1MB
 stdout_logfile=syslog
 stderr_logfile=syslog
 dependent_startup=true
diff --git a/dockers/docker-orchagent/watchdog_processes.j2 b/dockers/docker-orchagent/watchdog_processes.j2
new file mode 100644
index 000000000000..bbe7c3a734b3
--- /dev/null
+++ b/dockers/docker-orchagent/watchdog_processes.j2
@@ -0,0 +1 @@
+program:orchagent
\ No newline at end of file
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
index dbfdaf2c5ac5..877d1a3ed6bf 100755
--- a/files/scripts/supervisor-proc-exit-listener
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -13,6 +13,16 @@ from collections import defaultdict
 from swsscommon import swsscommon
 from supervisor import childutils
+from supervisor.events import EventTypes, getEventNameByType
+
+PROCESS_COMMUNICATION_STDOUT = \
+    getEventNameByType(EventTypes.PROCESS_COMMUNICATION_STDOUT)
+
+# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
+# following format:
+#
+# program:
+WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
 
 # Each line of this file should specify either one critical process or one
 # critical process group, (as defined in supervisord.conf file), in the
@@ -34,15 +44,15 @@ ALERTING_INTERVAL_SECS = 60
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
 
-def get_critical_group_and_process_list():
+def get_group_and_process_list(process_file):
     """
-    @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
+    @summary: Read the critical processes/group names.
     @return: Two lists which contain critical processes and group names respectively.
     """
-    critical_group_list = []
-    critical_process_list = []
+    group_list = []
+    process_list = []
 
-    with open(CRITICAL_PROCESSES_FILE, 'r') as file:
+    with open(process_file, 'r') as file:
         for line in file:
             # ignore blank lines
             if re.match(r"^\s*$", line):
@@ -50,24 +60,24 @@ def get_critical_group_and_process_list():
             line_info = line.strip(' \n').split(':')
             if len(line_info) != 2:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(5)
 
             identifier_key = line_info[0].strip()
             identifier_value = line_info[1].strip()
             if identifier_key == "group" and identifier_value:
-                critical_group_list.append(identifier_value)
+                group_list.append(identifier_value)
             elif identifier_key == "program" and identifier_value:
-                critical_process_list.append(identifier_value)
+                process_list.append(identifier_value)
             else:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(6)
 
-    return critical_group_list, critical_process_list
+    return group_list, process_list
 
 
-def generate_alerting_message(process_name, dead_minutes):
+def generate_alerting_message(process_name, status, dead_minutes):
     """
     @summary: If a critical process was not running, this function will determine it resides in host
               or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +90,8 @@ def generate_alerting_message(process_name, dead_minutes):
     else:
         namespace = namespace_prefix + namespace_id
 
-    syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
-                  .format(process_name, namespace, dead_minutes))
+    syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
+                  .format(process_name, status, namespace, dead_minutes))
 
 
 def get_autorestart_state(container_name):
@@ -125,63 +135,87 @@ def main(argv):
         syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
         sys.exit(1)
 
-    critical_group_list, critical_process_list = get_critical_group_and_process_list()
+    critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
+    _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
     process_under_alerting = defaultdict(dict)
+    process_heart_beat_info = defaultdict(dict)
 
     # Transition from ACKNOWLEDGED to READY
     childutils.listener.ready()
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
     while True:
-        file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
-        if len(file_descriptor_list) > 0:
-            line = file_descriptor_list[0].readline()
-            headers = childutils.get_headers(line)
-            payload = sys.stdin.read(int(headers['len']))
-
-            # Handle the PROCESS_STATE_EXITED event
-            if headers['eventname'] == 'PROCESS_STATE_EXITED':
-                payload_headers, payload_data = childutils.eventdata(payload + '\n')
-
-                expected = int(payload_headers['expected'])
-                process_name = payload_headers['processname']
-                group_name = payload_headers['groupname']
-
-                if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
-                    is_auto_restart = get_autorestart_state(container_name)
-                    if is_auto_restart != "disabled":
-                        MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
-                        msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
-                        syslog.syslog(syslog.LOG_INFO, msg)
-                        publish_events(events_handle, payload_headers['processname'], container_name)
-                        swsscommon.events_deinit_publisher(events_handle)
-                        os.kill(os.getppid(), signal.SIGTERM)
-                    else:
-                        process_under_alerting[process_name]["last_alerted"] = time.time()
-                        process_under_alerting[process_name]["dead_minutes"] = 0
-
-            # Handle the PROCESS_STATE_RUNNING event
-            elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
-                payload_headers, payload_data = childutils.eventdata(payload + '\n')
-                process_name = payload_headers['processname']
-
-                if process_name in process_under_alerting:
-                    process_under_alerting.pop(process_name)
-
+        try:
+            file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
+            if len(file_descriptor_list) > 0:
+                line = file_descriptor_list[0].readline()
+                headers = childutils.get_headers(line)
+                payload = sys.stdin.read(int(headers['len']))
+
+                # Handle the PROCESS_STATE_EXITED event
+                if headers['eventname'] == 'PROCESS_STATE_EXITED':
+                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
+
+                    expected = int(payload_headers['expected'])
+                    process_name = payload_headers['processname']
+                    group_name = payload_headers['groupname']
+
+                    if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
+                        is_auto_restart = get_autorestart_state(container_name)
+                        if is_auto_restart != "disabled":
+                            MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
+                            msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
+                            syslog.syslog(syslog.LOG_INFO, msg)
+                            publish_events(events_handle, payload_headers['processname'], container_name)
+                            swsscommon.events_deinit_publisher(events_handle)
+                            os.kill(os.getppid(), signal.SIGTERM)
+                        else:
+                            process_under_alerting[process_name]["last_alerted"] = time.time()
+                            process_under_alerting[process_name]["dead_minutes"] = 0
+
+                # Handle the PROCESS_STATE_RUNNING event
+                elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
+                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
+                    process_name = payload_headers['processname']
+
+                    if process_name in process_under_alerting:
+                        process_under_alerting.pop(process_name)
+
+                # Handle the PROCESS_COMMUNICATION_STDOUT event
+                elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
+                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
+                    process_name = payload_headers['processname']
+
+                    # update process heart beat time
+                    if (process_name in watch_process_list):
+                        process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
+
+
+            # Check whether we need write alerting messages into syslog
+            for process_name in process_under_alerting.keys():
+                epoch_time = time.time()
+                elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
+                if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                    elapsed_mins = elapsed_secs // 60
+                    process_under_alerting[process_name]["last_alerted"] = epoch_time
+                    process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
+                    generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
+
+            # Check whether we need write alerting messages into syslog
+            for process in process_heart_beat_info.keys():
+                epoch_time = time.time()
+                elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
+                if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                    elapsed_mins = elapsed_secs // 60
+                    generate_alerting_message(process, "stuck", elapsed_mins)
+
+        except Exception as ex:
+            syslog.syslog(syslog.LOG_ERR, "Exception: {}".format(ex))
+        finally:
             # Transition from BUSY to ACKNOWLEDGED
             childutils.listener.ok()
             # Transition from ACKNOWLEDGED to READY
             childutils.listener.ready()
 
-        # Check whether we need write alerting messages into syslog
-        for process_name in process_under_alerting.keys():
-            epoch_time = time.time()
-            elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
-            if elapsed_secs >= ALERTING_INTERVAL_SECS:
-                elapsed_mins = elapsed_secs // 60
-                process_under_alerting[process_name]["last_alerted"] = epoch_time
-                process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
-                generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
-
 
 if __name__ == "__main__":
     main(sys.argv[1:])

From 4f244af0b85e675254ad9e3fa3564da620e2d209 Mon Sep 17 00:00:00 2001
From: liuh-80
Date: Tue, 23 May 2023 02:48:53 +0000
Subject: [PATCH 2/3] Fix code

---
 dockers/docker-orchagent/supervisord.conf.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/docker-orchagent/supervisord.conf.j2 b/dockers/docker-orchagent/supervisord.conf.j2
index 498c8a48eb45..89d966d479c5 100644
--- a/dockers/docker-orchagent/supervisord.conf.j2
+++ b/dockers/docker-orchagent/supervisord.conf.j2
@@ -14,7 +14,7 @@ buffer_size=1024
 
 [eventlistener:supervisor-proc-exit-listener]
 command=/usr/bin/supervisor-proc-exit-listener --container-name swss
-events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
+events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT,TICK_5
 autostart=true
 autorestart=unexpected
 buffer_size=1024

From c0a95a4bb8ab195dcde0986819ec68f195b5b855 Mon Sep 17 00:00:00 2001
From: liuh-80
Date: Tue, 23 May 2023 03:42:18 +0000
Subject: [PATCH 3/3] Fix code issue

---
 dockers/docker-orchagent/supervisord.conf.j2 |   2 +-
 files/scripts/supervisor-proc-exit-listener  | 133 +++++++++----------
 2 files changed, 63 insertions(+), 72 deletions(-)

diff --git a/dockers/docker-orchagent/supervisord.conf.j2 b/dockers/docker-orchagent/supervisord.conf.j2
index 89d966d479c5..6c822f464a8d 100644
--- a/dockers/docker-orchagent/supervisord.conf.j2
+++ b/dockers/docker-orchagent/supervisord.conf.j2
@@ -14,7 +14,7 @@ buffer_size=1024
 
 [eventlistener:supervisor-proc-exit-listener]
 command=/usr/bin/supervisor-proc-exit-listener --container-name swss
-events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT,TICK_5
+events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
 autostart=true
 autorestart=unexpected
 buffer_size=1024
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
index 877d1a3ed6bf..9ba29466a913 100755
--- a/files/scripts/supervisor-proc-exit-listener
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -13,10 +13,6 @@ from collections import defaultdict
 from swsscommon import swsscommon
 from supervisor import childutils
-from supervisor.events import EventTypes, getEventNameByType
-
-PROCESS_COMMUNICATION_STDOUT = \
-    getEventNameByType(EventTypes.PROCESS_COMMUNICATION_STDOUT)
 
 # Each line of this file should specify one process, (as defined in supervisord.conf file), in the
 # following format:
@@ -144,78 +140,73 @@ def main(argv):
     childutils.listener.ready()
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
     while True:
-        try:
-            file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
-            if len(file_descriptor_list) > 0:
-                line = file_descriptor_list[0].readline()
-                headers = childutils.get_headers(line)
-                payload = sys.stdin.read(int(headers['len']))
-
-                # Handle the PROCESS_STATE_EXITED event
-                if headers['eventname'] == 'PROCESS_STATE_EXITED':
-                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
-
-                    expected = int(payload_headers['expected'])
-                    process_name = payload_headers['processname']
-                    group_name = payload_headers['groupname']
-
-                    if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
-                        is_auto_restart = get_autorestart_state(container_name)
-                        if is_auto_restart != "disabled":
-                            MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
-                            msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
-                            syslog.syslog(syslog.LOG_INFO, msg)
-                            publish_events(events_handle, payload_headers['processname'], container_name)
-                            swsscommon.events_deinit_publisher(events_handle)
-                            os.kill(os.getppid(), signal.SIGTERM)
-                        else:
-                            process_under_alerting[process_name]["last_alerted"] = time.time()
-                            process_under_alerting[process_name]["dead_minutes"] = 0
-
-                # Handle the PROCESS_STATE_RUNNING event
-                elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
-                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
-                    process_name = payload_headers['processname']
-
-                    if process_name in process_under_alerting:
-                        process_under_alerting.pop(process_name)
-
-                # Handle the PROCESS_COMMUNICATION_STDOUT event
-                elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
-                    payload_headers, payload_data = childutils.eventdata(payload + '\n')
-                    process_name = payload_headers['processname']
-
-                    # update process heart beat time
-                    if (process_name in watch_process_list):
-                        process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
-
-
-            # Check whether we need write alerting messages into syslog
-            for process_name in process_under_alerting.keys():
-                epoch_time = time.time()
-                elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
-                if elapsed_secs >= ALERTING_INTERVAL_SECS:
-                    elapsed_mins = elapsed_secs // 60
-                    process_under_alerting[process_name]["last_alerted"] = epoch_time
-                    process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
-                    generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
-
-            # Check whether we need write alerting messages into syslog
-            for process in process_heart_beat_info.keys():
-                epoch_time = time.time()
-                elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
-                if elapsed_secs >= ALERTING_INTERVAL_SECS:
-                    elapsed_mins = elapsed_secs // 60
-                    generate_alerting_message(process, "stuck", elapsed_mins)
-
-        except Exception as ex:
-            syslog.syslog(syslog.LOG_ERR, "Exception: {}".format(ex))
-        finally:
+        file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
+        if len(file_descriptor_list) > 0:
+            line = file_descriptor_list[0].readline()
+            headers = childutils.get_headers(line)
+            payload = sys.stdin.read(int(headers['len']))
+
+            # Handle the PROCESS_STATE_EXITED event
+            if headers['eventname'] == 'PROCESS_STATE_EXITED':
+                payload_headers, payload_data = childutils.eventdata(payload + '\n')
+
+                expected = int(payload_headers['expected'])
+                process_name = payload_headers['processname']
+                group_name = payload_headers['groupname']
+
+                if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
+                    is_auto_restart = get_autorestart_state(container_name)
+                    if is_auto_restart != "disabled":
+                        MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
+                        msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
+                        syslog.syslog(syslog.LOG_INFO, msg)
+                        publish_events(events_handle, payload_headers['processname'], container_name)
+                        swsscommon.events_deinit_publisher(events_handle)
+                        os.kill(os.getppid(), signal.SIGTERM)
+                    else:
+                        process_under_alerting[process_name]["last_alerted"] = time.time()
+                        process_under_alerting[process_name]["dead_minutes"] = 0
+
+            # Handle the PROCESS_STATE_RUNNING event
+            elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
+                payload_headers, payload_data = childutils.eventdata(payload + '\n')
+                process_name = payload_headers['processname']
+
+                if process_name in process_under_alerting:
+                    process_under_alerting.pop(process_name)
+
+            # Handle the PROCESS_COMMUNICATION_STDOUT event
+            elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
+                payload_headers, payload_data = childutils.eventdata(payload + '\n')
+                process_name = payload_headers['processname']
+
+                # update process heart beat time
+                if (process_name in watch_process_list):
+                    process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
+
         # Transition from BUSY to ACKNOWLEDGED
         childutils.listener.ok()
         # Transition from ACKNOWLEDGED to READY
         childutils.listener.ready()
 
+        # Check whether we need write alerting messages into syslog
+        for process_name in process_under_alerting.keys():
+            epoch_time = time.time()
+            elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
+            if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                elapsed_mins = elapsed_secs // 60
+                process_under_alerting[process_name]["last_alerted"] = epoch_time
+                process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
+                generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
+
+        # Check whether we need write alerting messages into syslog
+        for process in process_heart_beat_info.keys():
+            epoch_time = time.time()
+            elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
+            if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                elapsed_mins = elapsed_secs // 60
+                generate_alerting_message(process, "stuck", elapsed_mins)
+

 if __name__ == "__main__":
     main(sys.argv[1:])
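
Note on the heartbeat mechanism above (not part of the patches): supervisor only raises PROCESS_COMMUNICATION_STDOUT for a program whose stdout is captured (hence the stdout_capture_maxbytes=1MB setting added for orchagent) and that wraps its output in supervisor's communication tokens. A minimal, hypothetical Python sketch of a watched process emitting such a heartbeat follows; the interval and message text are illustrative and not taken from this PR.

    #!/usr/bin/env python3
    # Illustrative heartbeat emitter for a process listed in
    # /etc/supervisor/watchdog_processes. Each token-delimited write becomes a
    # PROCESS_COMMUNICATION_STDOUT event, which the listener stores as the
    # process's "last_heart_beat" timestamp.
    import sys
    import time

    # Tokens defined by supervisor for process-communication events.
    BEGIN_TOKEN = '<!--XSUPERVISOR:BEGIN-->'
    END_TOKEN = '<!--XSUPERVISOR:END-->'

    HEARTBEAT_INTERVAL_SECS = 10  # hypothetical value, not from the patch


    def emit_heartbeat(message='heartbeat'):
        """Wrap a message in supervisor's communication tokens and flush it."""
        sys.stdout.write(BEGIN_TOKEN + message + END_TOKEN)
        sys.stdout.flush()


    if __name__ == '__main__':
        while True:
            emit_heartbeat()
            time.sleep(HEARTBEAT_INTERVAL_SECS)

A process that never emits these tokens never appears in process_heart_beat_info, so the "stuck" alert in the listener only applies to processes that have reported at least one heartbeat.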