From f3e75c000c1fba444de17d692465246929719214 Mon Sep 17 00:00:00 2001 From: abdosi <58047199+abdosi@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:44:00 +0530 Subject: [PATCH] Disable routeCheck monit as part of config reload/minigraph stop service and enable it back as part of service start. (#3682) What I did: During config reload/load_minigraph, disable Monit monitoring of routeCheck when services are stopped and re-enable it when services are started again. With a large route scale (70K+ routes), routeCheck can transiently log a Monit error during the reload, which can cause sonic-mgmt test cases to fail because of loganalyzer. Why I did it: Because of this transient issue a Monit ERR log can be generated, and this can result in failure of sonic-mgmt test cases. How I verified it: Manual verification via `sudo monit status routeCheck`, and the unit tests were updated. --- config/main.py | 14 ++++++++------ tests/config_test.py | 7 +++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/config/main.py b/config/main.py index 1b48c6df9..5aaec94b1 100644 --- a/config/main.py +++ b/config/main.py @@ -886,8 +886,9 @@ def _get_disabled_services_list(config_db): def _stop_services(): try: subprocess.check_call(['sudo', 'monit', 'status'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - click.echo("Disabling container monitoring ...") + click.echo("Disabling container and routeCheck monitoring ...") clicommon.run_command(['sudo', 'monit', 'unmonitor', 'container_checker']) + clicommon.run_command(['sudo', 'monit', 'unmonitor', 'routeCheck']) except subprocess.CalledProcessError as err: pass @@ -946,17 +947,18 @@ def _restart_services(): wait_service_restart_finish('interfaces-config', last_interface_config_timestamp) wait_service_restart_finish('networking', last_networking_timestamp) + # Reload Monit configuration to pick up new hostname in case it changed + click.echo("Reloading Monit configuration ...") + clicommon.run_command(['sudo', 'monit', 'reload']) + try: subprocess.check_call(['sudo', 'monit', 'status'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - 
click.echo("Enabling container monitoring ...") + click.echo("Enabling container and routeCheck monitoring ...") clicommon.run_command(['sudo', 'monit', 'monitor', 'container_checker']) + clicommon.run_command(['sudo', 'monit', 'monitor', 'routeCheck']) except subprocess.CalledProcessError as err: pass - # Reload Monit configuration to pick up new hostname in case it changed - click.echo("Reloading Monit configuration ...") - clicommon.run_command(['sudo', 'monit', 'reload']) - def _per_namespace_swss_ready(service_name): out, _ = clicommon.run_command(['systemctl', 'show', str(service_name), '--property', 'ActiveState', '--value'], return_cmd=True) if out.strip() != "active": diff --git a/tests/config_test.py b/tests/config_test.py index 6f538625c..082a4ff47 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -47,12 +47,14 @@ load_minigraph_command_output="""\ Acquired lock on {0} +Disabling container and routeCheck monitoring ... Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: config qos reload --no-dynamic-buffer --no-delay Running command: pfcwd start_default Restarting SONiC target ... Reloading Monit configuration ... +Enabling container and routeCheck monitoring ... Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. 
Released lock on {0} """ @@ -917,7 +919,8 @@ def setup_class(cls): importlib.reload(config.main) @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=("dummy_path", None))) - def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic): + @mock.patch('config.main.subprocess.check_call') + def test_load_minigraph(self, mock_check_call, get_cmd_module, setup_single_broadcom_asic): with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: (config, show) = get_cmd_module runner = CliRunner() @@ -930,7 +933,7 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic): (load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK)) # Verify "systemctl reset-failed" is called for services under sonic.target mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss']) - assert mock_run_command.call_count == 12 + assert mock_run_command.call_count == 16 @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=("dummy_path", None)))