Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix multiple failures in KillProcess test on KVM. #16303

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
10 changes: 9 additions & 1 deletion tests/common/devices/sonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,15 @@ def is_host_service_running(self, service):
@param service: Service name
@return: True if specified service is running, else False
"""
service_status = self.shell("sudo systemctl status {} | grep 'Active'".format(service))
try:
service_status = self.shell("sudo systemctl status {} | grep 'Active'".format(service))
hdwhdw marked this conversation as resolved.
Show resolved Hide resolved
except RunAnsibleModuleFail as e:
# If the services does not exist, systemd will output
# "Unit <service> could not be found." with a nonzero return code
# We want to catch the error here.
if 'could not be found' in e.results['stderr']:
return False
raise
return "active (running)" in service_status['stdout']

def critical_services_status(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1032,12 +1032,6 @@ gnmi/test_gnmi_configdb.py:
- "'t2' in topo_name"
- "is_multi_asic==True"

gnmi/test_gnoi_killprocess.py:
skip:
reason: "There is some issues running on kvm testbeds."
conditions:
- "asic_type in ['vs'] and https://github.com/sonic-net/sonic-mgmt/issues/16238"

#######################################
##### hash #####
#######################################
Expand Down
29 changes: 20 additions & 9 deletions tests/gnmi/test_gnoi_killprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .helper import gnoi_request
from tests.common.helpers.assertions import pytest_assert
from tests.common.helpers.dut_utils import is_container_running

from tests.common.platform.processes_utils import wait_critical_processes

pytestmark = [
pytest.mark.topology('any')
Expand All @@ -15,23 +15,33 @@
("gnmi", False, "Dbus does not support gnmi service management"),
("nonexistent", False, "Dbus does not support nonexistent service management"),
("", False, "Dbus stop_service called with no service specified"),
("snmp", True, ""),
Copy link
Contributor

@qiluo-msft qiluo-msft Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

snmp

Killing the snmp container should be a valid requirement. Why remove it? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is indeed valid. Restarting snmp container will produce an error log due to this line:
https://github.com/sonic-net/sonic-buildimage/blob/39e2131a7b76f6c3d5257b7e02c540dd33a24d5b/files/build_templates/docker_image_ctl.j2#L114

{%- elif docker_container_name == "snmp" %}
    $SONIC_DB_CLI STATE_DB HSET 'DEVICE_METADATA|localhost' chassis_serial_number $(decode-syseeprom -s)

Because

sudo decode-syseeprom -s
Failed to read system EEPROM info

I think this is known:

# For kvm testbed, command `show platform syseeprom` will return the expected Error

This will also cause a similar issue when killing pmon (I think this is due to "missing sonic_platform module".)

So for now let's just skip these two on the vs platform. I don't think this affects our ability to qualify the KillProcess implementation.

("swss", True, ""),
("dhcp_relay", True, ""),
("radv", True, ""),
("restapi", True, ""),
("lldp", True, ""),
("sshd", True, ""),
("swss", True, ""),
Copy link
Contributor

@qiluo-msft qiluo-msft Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

swss

What is wrong with killing swss? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turns out the test wasn't written correctly: we need to explicitly wait for critical processes to start after killing swss. Killing and restarting swss appears to make many other processes restart, and if we don't wait and instead start the next test case immediately, swss errors show up in that next test case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed after adding back a wait for critical processes.

("pmon", True, ""),
("rsyslog", True, ""),
("telemetry", True, "")
("telemetry", True, ""),
("snmp", True, ""),
])
def test_gnoi_killprocess_then_restart(duthosts, rand_one_dut_hostname, localhost, process, is_valid, expected_msg):
duthost = duthosts[rand_one_dut_hostname]

if process and process != "nonexistent":
pytest_assert(duthost.is_host_service_running(process),
"{} should be running before KillProcess test attempts to kill this process".format(process))
if process and not duthost.is_host_service_running(process):
pytest.skip("{} is not running".format(process))

if duthost.facts["asic_type"] == "vs" and process == "pmon":
# killing pmon in kvm will produce expected error due to missing sonic_platform module.
# pmon#chassis_db_init: Failed to load chassis due to ModuleNotFoundError("No module named 'sonic_platform'")
pytest.skip("killing pmon in kvm will produce expected error due to missing sonic_platform module.")

if duthost.facts["asic_type"] == "vs" and process == "snmp":
# killing snmp in kvm will produce expected error on snmp startup. See:
# admin@vlab-01:~$ sudo decode-syseeprom -s
# Failed to read system EEPROM info
pytest.skip("killing snmp in kvm will produce expected error due to missing sonic_platform module.")

request_kill_json_data = '{{"name": "{}", "signal": 1}}'.format(process)
ret, msg = gnoi_request(duthost, localhost, "KillProcess", request_kill_json_data)
Expand All @@ -49,7 +59,7 @@ def test_gnoi_killprocess_then_restart(duthosts, rand_one_dut_hostname, localhos
else:
pytest_assert(ret != 0, "KillProcess API unexpectedly succeeded with invalid request parameters")
pytest_assert(expected_msg in msg, "Unexpected error message in response to invalid gNOI request")

wait_critical_processes(duthost)
pytest_assert(duthost.critical_services_fully_started, "System unhealthy after gNOI API request")


Expand All @@ -70,15 +80,16 @@ def test_gnoi_killprocess_restart(duthosts, rand_one_dut_hostname, localhost, re
else:
pytest_assert(ret != 0, "KillProcess API unexpectedly succeeded with invalid request parameters")
pytest_assert("panic" in msg, "Unexpected error message in response to invalid gNOI request")
wait_critical_processes(duthost)
pytest_assert(duthost.critical_services_fully_started, "System unhealthy after gNOI API request")


def test_invalid_signal(duthosts, rand_one_dut_hostname, localhost):
    """Check that a KillProcess request carrying signal 2 is rejected.

    The request is sent through ``gnoi_request``; the test then asserts a
    nonzero return code, asserts that the response contains the
    "only supports SIGNAL_TERM (option 1)" message, waits for critical
    processes, and finally runs the system-health assertion.
    """
    dut = duthosts[rand_one_dut_hostname]

    # Signal value 2 is the input under test; the message asserted below
    # states that only option 1 is supported.
    payload = '{"name": "snmp", "restart": true, "signal": 2}'
    rc, response = gnoi_request(dut, localhost, "KillProcess", payload)

    request_rejected = rc != 0
    pytest_assert(request_rejected,
                  "KillProcess API unexpectedly succeeded with invalid request parameters")

    has_expected_error = "KillProcess only supports SIGNAL_TERM (option 1)" in response
    pytest_assert(has_expected_error,
                  "Unexpected error message in response to invalid gNOI request")

    wait_critical_processes(dut)
    # NOTE(review): critical_services_fully_started is referenced here without
    # being called. If it is a method rather than a property, this assertion
    # is always true -- confirm against the host class definition.
    pytest_assert(dut.critical_services_fully_started, "System unhealthy after gNOI API request")
Loading