From 14cb6db111c86fb3247e11425a9a1cdc14f71855 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Thu, 28 Sep 2023 02:09:43 +0000 Subject: [PATCH 1/3] Fix supervisor-proc-exit-listener false alert during warm reboot issue. --- files/scripts/supervisor-proc-exit-listener | 29 +++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 61c12d8ce45e..3a075c7e0189 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -73,6 +73,23 @@ def get_group_and_process_list(process_file): return group_list, process_list +def is_warm_reboot(): + """ + Checks if a warm reboot is going on + """ + try: + state_db_connector = swsscommon.DBConnector(swsscommon.STATE_DB, 0) + tbl = swsscommon.Table(state_db_connector, 'WARM_RESTART_ENABLE_TABLE') + + (status, value) = tbl.hget('system', 'enable') + if status and value == 'true': + return True + except RuntimeError as e: + syslog.syslog(syslog.LOG_ERR, "Check warm reboot status failed: {}".format(e)) + + return False + + def generate_alerting_message(process_name, status, dead_minutes): """ @summary: If a critical process was not running, this function will determine it resides in host @@ -86,8 +103,16 @@ def generate_alerting_message(process_name, status, dead_minutes): else: namespace = namespace_prefix + namespace_id - syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)." 
- .format(process_name, status, namespace, dead_minutes)) + message = "Process '{}' is {} in namespace '{}' ({} minutes).".format( + process_name, + status, + namespace, + dead_minutes) + if is_warm_reboot(): + syslog.syslog(syslog.LOG_INFO, "Warm rebooting, {}".format(message)) + return + + syslog.syslog(syslog.LOG_ERR, message) def get_autorestart_state(container_name, use_unix_socket_path): From 5a8d81b4ace154f35e90fd4678921cd3490c9d26 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Thu, 28 Sep 2023 06:20:00 +0000 Subject: [PATCH 2/3] Fix code issue --- files/scripts/supervisor-proc-exit-listener | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 3a075c7e0189..ee7f3c83df49 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -78,15 +78,13 @@ def is_warm_reboot(): Checks if a warm reboot is going on """ try: - state_db_connector = swsscommon.DBConnector(swsscommon.STATE_DB, 0) + state_db_connector = swsscommon.DBConnector("STATE_DB", 0) tbl = swsscommon.Table(state_db_connector, 'WARM_RESTART_ENABLE_TABLE') - (status, value) = tbl.hget('system', 'enable') if status and value == 'true': return True except RuntimeError as e: syslog.syslog(syslog.LOG_ERR, "Check warm reboot status failed: {}".format(e)) - return False From 9390287eb9686de7bddb4a6985477d21fd892833 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Sat, 7 Oct 2023 05:16:38 +0000 Subject: [PATCH 3/3] Fix PR according to comments --- files/scripts/supervisor-proc-exit-listener | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index ee7f3c83df49..5eab7847bd83 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -101,16 +101,8 @@ def 
generate_alerting_message(process_name, status, dead_minutes): else: namespace = namespace_prefix + namespace_id - message = "Process '{}' is {} in namespace '{}' ({} minutes).".format( - process_name, - status, - namespace, - dead_minutes) - if is_warm_reboot(): - syslog.syslog(syslog.LOG_INFO, "Warm rebooting, {}".format(message)) - return - - syslog.syslog(syslog.LOG_ERR, message) + syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)." + .format(process_name, status, namespace, dead_minutes)) def get_autorestart_state(container_name, use_unix_socket_path): @@ -235,6 +227,10 @@ def main(argv): epoch_time = time.time() elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] if elapsed_secs >= ALERTING_INTERVAL_SECS: + if is_warm_reboot() and process == "orchagent": + # Orchagent is set to frozen during warm reboot. + continue + elapsed_mins = elapsed_secs // 60 generate_alerting_message(process, "stuck", elapsed_mins)