Skip to content

Commit

Permalink
Disable routeCheck monit as part of config reload/minigraph stop serv…
Browse files Browse the repository at this point in the history
…ice and enable it back as part of service start. (#3682)

What I did:

During config reload/minigraph, disable routeCheck monitoring when services are stopped and re-enable it when services are started again.
With a large route scale (70K+ routes), routeCheck can transiently log a monit error during the reload,
which can cause sonic-mgmt test cases to fail because of loganalyzer.

Why I did:

Because of this transient issue, a monit ERR log can be generated, which can result in the failure of sonic-mgmt test cases.

How I verify:

Manually verified via `sudo monit status routeCheck`; unit tests (UT) updated.
  • Loading branch information
abdosi authored and mssonicbld committed Dec 18, 2024
1 parent 3561996 commit f3e75c0
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
14 changes: 8 additions & 6 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,8 +886,9 @@ def _get_disabled_services_list(config_db):
def _stop_services():
try:
subprocess.check_call(['sudo', 'monit', 'status'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
click.echo("Disabling container monitoring ...")
click.echo("Disabling container and routeCheck monitoring ...")
clicommon.run_command(['sudo', 'monit', 'unmonitor', 'container_checker'])
clicommon.run_command(['sudo', 'monit', 'unmonitor', 'routeCheck'])
except subprocess.CalledProcessError as err:
pass

Expand Down Expand Up @@ -946,17 +947,18 @@ def _restart_services():
wait_service_restart_finish('interfaces-config', last_interface_config_timestamp)
wait_service_restart_finish('networking', last_networking_timestamp)

# Reload Monit configuration to pick up new hostname in case it changed
click.echo("Reloading Monit configuration ...")
clicommon.run_command(['sudo', 'monit', 'reload'])

try:
subprocess.check_call(['sudo', 'monit', 'status'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
click.echo("Enabling container monitoring ...")
click.echo("Enabling container and routeCheck monitoring ...")
clicommon.run_command(['sudo', 'monit', 'monitor', 'container_checker'])
clicommon.run_command(['sudo', 'monit', 'monitor', 'routeCheck'])
except subprocess.CalledProcessError as err:
pass

# Reload Monit configuration to pick up new hostname in case it changed
click.echo("Reloading Monit configuration ...")
clicommon.run_command(['sudo', 'monit', 'reload'])

def _per_namespace_swss_ready(service_name):
out, _ = clicommon.run_command(['systemctl', 'show', str(service_name), '--property', 'ActiveState', '--value'], return_cmd=True)
if out.strip() != "active":
Expand Down
7 changes: 5 additions & 2 deletions tests/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@

load_minigraph_command_output="""\
Acquired lock on {0}
Disabling container and routeCheck monitoring ...
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
Running command: config qos reload --no-dynamic-buffer --no-delay
Running command: pfcwd start_default
Restarting SONiC target ...
Reloading Monit configuration ...
Enabling container and routeCheck monitoring ...
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
Released lock on {0}
"""
Expand Down Expand Up @@ -917,7 +919,8 @@ def setup_class(cls):
importlib.reload(config.main)

@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=("dummy_path", None)))
def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
@mock.patch('config.main.subprocess.check_call')
def test_load_minigraph(self, mock_check_call, get_cmd_module, setup_single_broadcom_asic):
with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
(config, show) = get_cmd_module
runner = CliRunner()
Expand All @@ -930,7 +933,7 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
(load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK))
# Verify "systemctl reset-failed" is called for services under sonic.target
mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
assert mock_run_command.call_count == 12
assert mock_run_command.call_count == 16

@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs',
mock.MagicMock(return_value=("dummy_path", None)))
Expand Down

0 comments on commit f3e75c0

Please sign in to comment.