From 9e29765fae93116fbaccc2172118aa9c19c5d642 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 22 Feb 2021 17:02:15 -0800 Subject: [PATCH 1/3] [Event listener] Deduplicate the alerting messages. Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index e8565b4d52f2..f643b30f365c 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -8,6 +8,7 @@ import signal import sys import syslog import time +from collections import defaultdict import swsssdk @@ -64,7 +65,7 @@ def get_critical_group_and_process_list(): return critical_group_list, critical_process_list -def generate_alerting_message(process_name): +def generate_alerting_message(process_name, num_minutes): """ @summary: If a critical process was not running, this function will determine it resides in host or in a specific namespace. Then an alerting message will be written into syslog. @@ -77,7 +78,8 @@ def generate_alerting_message(process_name): else: namespace = namespace_prefix + namespace_id - syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}'.".format(process_name, namespace)) + syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}'({} minutes)." 
+ .format(process_name, namespace, num_minutes)) def get_autorestart_state(container_name): @@ -118,7 +120,7 @@ def main(argv): critical_group_list, critical_process_list = get_critical_group_and_process_list() - process_under_alerting = {} + process_under_alerting = defaultdict(list) # Transition from ACKNOWLEDGED to READY childutils.listener.ready() @@ -145,7 +147,8 @@ def main(argv): syslog.syslog(syslog.LOG_INFO, msg) os.kill(os.getppid(), signal.SIGTERM) else: - process_under_alerting[process_name] = time.time() + process_under_alerting[process_name].append(time.time()) + process_under_alerting[process_name].append(0) # Handle the PROCESS_STATE_RUNNING event elif headers['eventname'] == 'PROCESS_STATE_RUNNING': @@ -164,9 +167,10 @@ def main(argv): # Check whether we need write alerting messages into syslog for process in process_under_alerting.keys(): epoch_time = time.time() - if epoch_time - process_under_alerting[process] >= ALERTING_INTERVAL_SECS: - process_under_alerting[process] = epoch_time - generate_alerting_message(process) + if epoch_time - process_under_alerting[process][0] >= ALERTING_INTERVAL_SECS: + process_under_alerting[process][0] = epoch_time + process_under_alerting[process][1] += 1 + generate_alerting_message(process, process_under_alerting[process][1]) if __name__ == "__main__": From bbd48c87851cc3fa774c1b856b8cb9c4e8d26f73 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 23 Feb 2021 12:26:07 -0800 Subject: [PATCH 2/3] [event listener] Make the following changes: 1. Fix the format of the alerting message. 2. For each exited process, track two fields: the time of the last alert and the number of dead minutes. Use a dict to hold these two fields instead of a list. 3. Use a formula to calculate how many minutes the process has been in the dead state instead of a hard-coded increment. 
Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index f643b30f365c..432c81e7a700 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -65,7 +65,7 @@ def get_critical_group_and_process_list(): return critical_group_list, critical_process_list -def generate_alerting_message(process_name, num_minutes): +def generate_alerting_message(process_name, dead_minutes): """ @summary: If a critical process was not running, this function will determine it resides in host or in a specific namespace. Then an alerting message will be written into syslog. @@ -78,8 +78,8 @@ def generate_alerting_message(process_name, num_minutes): else: namespace = namespace_prefix + namespace_id - syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}'({} minutes)." - .format(process_name, namespace, num_minutes)) + syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)." 
+ .format(process_name, namespace, dead_minutes)) def get_autorestart_state(container_name): @@ -120,7 +120,7 @@ def main(argv): critical_group_list, critical_process_list = get_critical_group_and_process_list() - process_under_alerting = defaultdict(list) + process_under_alerting = defaultdict(dict) # Transition from ACKNOWLEDGED to READY childutils.listener.ready() @@ -147,8 +147,8 @@ def main(argv): syslog.syslog(syslog.LOG_INFO, msg) os.kill(os.getppid(), signal.SIGTERM) else: - process_under_alerting[process_name].append(time.time()) - process_under_alerting[process_name].append(0) + process_under_alerting[process_name]["last_alerted"] = time.time() + process_under_alerting[process_name]["dead_minutes"] = 0 # Handle the PROCESS_STATE_RUNNING event elif headers['eventname'] == 'PROCESS_STATE_RUNNING': @@ -165,12 +165,14 @@ def main(argv): childutils.listener.ready() # Check whether we need write alerting messages into syslog - for process in process_under_alerting.keys(): + for process_name in process_under_alerting.keys(): epoch_time = time.time() - if epoch_time - process_under_alerting[process][0] >= ALERTING_INTERVAL_SECS: - process_under_alerting[process][0] = epoch_time - process_under_alerting[process][1] += 1 - generate_alerting_message(process, process_under_alerting[process][1]) + elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"] + if elapsed_secs >= ALERTING_INTERVAL_SECS: + elapsed_mins = int(elapsed_secs / 60) + process_under_alerting[process_name]["last_alerted"] = epoch_time + process_under_alerting[process_name]["dead_minutes"] += elapsed_mins + generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"]) if __name__ == "__main__": From d84edd5bc6a30e55cbd207b7b01000921d00f838 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 24 Feb 2021 15:16:02 -0800 Subject: [PATCH 3/3] [event listener] Use floor division operator --- files/scripts/supervisor-proc-exit-listener | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 432c81e7a700..7bf3059b5e9a 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -169,7 +169,7 @@ def main(argv): epoch_time = time.time() elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"] if elapsed_secs >= ALERTING_INTERVAL_SECS: - elapsed_mins = int(elapsed_secs / 60) + elapsed_mins = elapsed_secs // 60 process_under_alerting[process_name]["last_alerted"] = epoch_time process_under_alerting[process_name]["dead_minutes"] += elapsed_mins generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])