From 13d2ec3f76c9339e9c5b7651cef2a1b48847fa24 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Mon, 2 Sep 2024 17:15:10 +0800 Subject: [PATCH] Update the advanced reboot test to collect tcpdump on the server physical port Get the tcpdump on the physical interface of the test server instead of the logical ptf interfaces. This is to fix the issue that sometimes there could be random packet drop on the ptf. --- .../test/files/ptftests/device_connection.py | 27 +++++ .../files/ptftests/py3/advanced-reboot.py | 105 +++++------------- tests/common/fixtures/advanced_reboot.py | 20 +++- tests/platform_tests/test_cont_warm_reboot.py | 10 +- 4 files changed, 75 insertions(+), 87 deletions(-) diff --git a/ansible/roles/test/files/ptftests/device_connection.py b/ansible/roles/test/files/ptftests/device_connection.py index a331210d422..19ad85ebb59 100644 --- a/ansible/roles/test/files/ptftests/device_connection.py +++ b/ansible/roles/test/files/ptftests/device_connection.py @@ -93,3 +93,30 @@ def execCommand(self, cmd, timeout=DEFAULT_CMD_EXECUTION_TIMEOUT_SEC): client.close() return stdOut, stdErr, retValue + + @retry( + stop_max_attempt_number=2, + retry_on_exception=lambda e: isinstance(e, AuthenticationException) + ) + def fetch(self, remote_path, local_path): + """ + Fetch the file from the remote device + @param remote_path: the full path of the file to fetch + @param local_path: the full path of the file to be saved locally + """ + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + try: + client.connect(self.hostname, username=self.username, password=self.password, allow_agent=False) + ftp_client = client.open_sftp() + ftp_client.get(remote_path, local_path) + ftp_client.close() + except AuthenticationException as authenticationException: + logger.error('SSH Authentication failure with message: %s' % + authenticationException) + if self.alt_password is not None: + # attempt retry with alt_password + self.password = self.alt_password + raise AuthenticationException + finally: + client.close() diff --git a/ansible/roles/test/files/ptftests/py3/advanced-reboot.py b/ansible/roles/test/files/ptftests/py3/advanced-reboot.py index 3faa038dc89..857058ac71a 100644 --- a/ansible/roles/test/files/ptftests/py3/advanced-reboot.py +++ b/ansible/roles/test/files/ptftests/py3/advanced-reboot.py @@ -145,6 +145,10 @@ def __init__(self): self.check_param('dut_username', '', required=True) self.check_param('dut_password', '', required=True) self.check_param('dut_hostname', '', required=True) + self.check_param('vmhost_username', '', required=True) + self.check_param('vmhost_password', '', required=True) + self.check_param('vmhost_mgmt_ip', '', required=True) + self.check_param('vmhost_external_port', '', required=True) self.check_param('reboot_limit_in_seconds', 30, required=False) self.check_param('reboot_type', 'fast-reboot', required=False) self.check_param('graceful_limit', 240, required=False) @@ -275,6 +279,12 @@ def __init__(self): alt_password=self.test_params.get('alt_password') ) + self.vmhost_connection = DeviceConnection( + self.test_params['vmhost_mgmt_ip'], + self.test_params['vmhost_username'], + password=self.test_params['vmhost_password'] + ) + self.sender_thr = threading.Thread(target=self.send_in_background) self.sniff_thr = threading.Thread(target=self.sniff_in_background) @@ -754,6 +764,8 @@ def tearDown(self): self.log("Disabling arp_responder") self.cmd(["supervisorctl", "stop", "arp_responder"]) + self.log("Remove the tcpdump pcap on the vm host.") + self.vmhost_connection.execCommand(f"sudo rm -rf {self.remote_capture_pcap}") # Stop watching DUT self.watching = False @@ -1750,7 +1762,7 @@ def sniff_in_background(self, wait=None): 'wait': wait, 'sniff_filter': sniff_filter}) sniffer.start() # Let the scapy sniff initialize completely. - time.sleep(2) + time.sleep(10) # Unblock waiter for the send_in_background. self.sniffer_started.set() sniffer.join() @@ -1760,7 +1772,7 @@ def sniff_in_background(self, wait=None): def tcpdump_sniff(self, wait=300, sniff_filter=''): """ - @summary: PTF runner - runs a sniffer in PTF container. + @summary: PTF runner - runs a sniffer in vmhost(server). Args: wait (int): Duration in seconds to sniff the traffic sniff_filter (str): Filter that tcpdump will use to collect only relevant packets @@ -1768,10 +1780,13 @@ def tcpdump_sniff(self, wait=300, sniff_filter=''): try: capture_pcap = ("/tmp/capture_%s.pcap" % self.logfile_suffix if self.logfile_suffix is not None else "/tmp/capture.pcap") - subprocess.call(["rm", "-rf", capture_pcap]) # remove old capture + remote_capture_pcap = capture_pcap + f"_{self.test_params['dut_hostname']}" + self.remote_capture_pcap = remote_capture_pcap + self.vmhost_connection.execCommand(f"sudo rm -rf {remote_capture_pcap}") + subprocess.call(["rm", "-rf", capture_pcap]) self.kill_sniffer = False - self.start_sniffer(capture_pcap, sniff_filter, wait) - self.create_single_pcap(capture_pcap) + self.start_sniffer(remote_capture_pcap, sniff_filter, wait) + self.vmhost_connection.fetch(remote_capture_pcap, capture_pcap) self.packets = scapyall.rdpcap(capture_pcap) self.log("Number of all packets captured: {}".format(len(self.packets))) except Exception: @@ -1782,14 +1797,10 @@ def start_sniffer(self, pcap_path, tcpdump_filter, timeout): """ Start tcpdump sniffer on all data interfaces, and kill them after a specified timeout """ - self.tcpdump_data_ifaces = [ - iface for iface in scapyall.get_if_list() if iface.startswith('eth')] - processes_list = [] - for iface in self.tcpdump_data_ifaces: - iface_pcap_path = '{}_{}'.format(pcap_path, iface) - process = subprocess.Popen(['tcpdump', '-i', iface, tcpdump_filter, '-w', iface_pcap_path]) - self.log('Tcpdump sniffer starting on iface: {}'.format(iface)) - processes_list.append(process) + interface = self.test_params['vmhost_external_port'] + cmd = f"sudo nohup tcpdump -i {interface} {tcpdump_filter} -w {pcap_path}" + self.vmhost_connection.execCommand(cmd) + self.log(f'Tcpdump sniffer starting on vmhost interface: {interface}') time_start = time.time() while not self.kill_sniffer: @@ -1799,71 +1810,9 @@ def start_sniffer(self, pcap_path, tcpdump_filter, timeout): break time_start = curr_time - self.log("Going to kill all tcpdump processes by SIGTERM") - for process in processes_list: - process.terminate() - - for process in processes_list: - process.wait(timeout=5) - # Return code here could be 0, so we need to explicitly check for None - if process.returncode is not None: - self.log("Tcpdump process {} terminated".format(process.args)) - - for process in processes_list: - if process.returncode is not None: - continue - self.log("Killing tcpdump process {}".format(process.args)) - process.kill() - process.wait(timeout=5) - # Return code here could be 0, so we need to explicitly check for None - if process.returncode is not None: - self.log("Tcpdump process {} killed".format(process.args)) - - self.log("Killed all tcpdump processes") - - def create_single_pcap(self, pcap_path): - """ - Merge all pcaps from each interface into single pcap file - """ - pcapng_full_capture = self.merge_pcaps( - pcap_path, self.tcpdump_data_ifaces) - self.convert_pcapng_to_pcap(pcap_path, pcapng_full_capture) - self.log('Pcap files merged into single pcap file: {}'.format(pcap_path)) - - def merge_pcaps(self, pcap_path, data_ifaces): - """ - Merge all pcaps into one, format: pcapng - """ - pcapng_full_capture = '{}.pcapng'.format(pcap_path) - cmd = ['mergecap', '-w', pcapng_full_capture] - ifaces_pcap_files_list = [] - for iface in data_ifaces: - pcap_file_path = '{}_{}'.format(pcap_path, iface) - if os.path.exists(pcap_file_path): - cmd.append(pcap_file_path) - ifaces_pcap_files_list.append(pcap_file_path) - - self.log('Starting merge pcap files') - subprocess.call(cmd) - self.log('Pcap files merged into tmp pcapng file') - - # Remove pcap files created per interface - for pcap_file in ifaces_pcap_files_list: - subprocess.call(['rm', '-f', pcap_file]) - - return pcapng_full_capture - - def convert_pcapng_to_pcap(self, pcap_path, pcapng_full_capture): - """ - Convert pcapng file into pcap. We can't just merge all in pcap, - mergecap can merge multiple files only into pcapng format - """ - cmd = ['mergecap', '-F', 'pcap', '-w', pcap_path, pcapng_full_capture] - self.log('Converting pcapng file into pcap file') - subprocess.call(cmd) - self.log('Pcapng file converted into pcap file') - # Remove tmp pcapng file - subprocess.call(['rm', '-f', pcapng_full_capture]) + self.log("Going to kill the tcpdump process by SIGTERM") + self.vmhost_connection.execCommand(f'sudo pkill -f "{cmd}"') + self.log("Killed the tcpdump process") def check_tcp_payload(self, packet): """ diff --git a/tests/common/fixtures/advanced_reboot.py b/tests/common/fixtures/advanced_reboot.py index 74b4d4827b7..585c1efa3e8 100644 --- a/tests/common/fixtures/advanced_reboot.py +++ b/tests/common/fixtures/advanced_reboot.py @@ -40,7 +40,7 @@ class AdvancedReboot: Test cases can trigger test start utilizing runRebootTestcase API. """ - def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs): + def __init__(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs): """ Class constructor. @param request: pytest request object @@ -85,6 +85,7 @@ def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds self.duthost = duthost self.ptfhost = ptfhost self.localhost = localhost + self.vmhost = vmhost self.tbinfo = tbinfo self.creds = creds self.moduleIgnoreErrors = kwargs["allow_fail"] if "allow_fail" in kwargs else False @@ -185,6 +186,13 @@ def __buildTestbedData(self, tbinfo): if attr['hwsku'] == 'Arista-VM' ] + self.rebootData['vmhost_mgmt_ip'] = self.vmhost.mgmt_ip + self.rebootData['vmhost_external_port'] = self.vmhost.external_port + self.rebootData['vmhost_username'] = \ + self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_user'] + self.rebootData['vmhost_password'] = \ + self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_password'] + self.hostMaxLen = len(self.rebootData['arista_vms']) - 1 self.lagMemberCnt = len(list(self.mgFacts['minigraph_portchannels'].values())[0]['members']) self.vlanMaxCnt = len(list(self.mgFacts['minigraph_vlans'].values())[0]['members']) - 1 @@ -707,6 +715,10 @@ def __runPtfRunner(self, rebootOper=None): "dut_username": self.rebootData['dut_username'], "dut_password": self.rebootData['dut_password'], "dut_hostname": self.rebootData['dut_hostname'], + "vmhost_username": self.rebootData['vmhost_username'], + "vmhost_password": self.rebootData['vmhost_password'], + "vmhost_mgmt_ip": self.rebootData['vmhost_mgmt_ip'], + "vmhost_external_port": self.rebootData['vmhost_external_port'], "reboot_limit_in_seconds": self.rebootLimit, "reboot_type": self.rebootType, "other_vendor_flag": self.other_vendor_nos, @@ -875,8 +887,8 @@ def tearDown(self): @pytest.fixture -def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, - creds): +def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, vmhost, + tbinfo, creds): """ Pytest test fixture that provides access to AdvancedReboot test fixture @param request: pytest request object @@ -893,7 +905,7 @@ def get_advanced_reboot(**kwargs): API that returns instances of AdvancedReboot class """ assert len(instances) == 0, "Only one instance of reboot data is allowed" - advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs) + advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs) instances.append(advancedReboot) return advancedReboot diff --git a/tests/platform_tests/test_cont_warm_reboot.py b/tests/platform_tests/test_cont_warm_reboot.py index c08d85c0a92..ca7443e375e 100644 --- a/tests/platform_tests/test_cont_warm_reboot.py +++ b/tests/platform_tests/test_cont_warm_reboot.py @@ -293,7 +293,7 @@ def create_test_report(self): pytest_assert(self.test_failures == 0, "Continuous reboot test failed {}/{} times". format(self.test_failures, self.reboot_count)) - def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds): + def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds): self.test_set_up() # Start continuous warm/fast reboot on the DUT for count in range(self.continuous_reboot_count): @@ -306,8 +306,8 @@ def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost .format(self.reboot_count, self.continuous_reboot_count, self.reboot_type)) reboot_type = self.reboot_type + "-reboot" try: - self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, - rebootType=reboot_type, moduleIgnoreErrors=True) + self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, + creds, rebootType=reboot_type, moduleIgnoreErrors=True) except Exception: self.sub_test_result = False self.test_failures = self.test_failures + 1 @@ -355,7 +355,7 @@ def test_teardown(self): @pytest.mark.device_type('vs') def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, - ptfhost, localhost, conn_graph_facts, tbinfo, creds): + ptfhost, localhost, vmhost, conn_graph_facts, tbinfo, creds): """ @summary: This test performs continuous reboot cycles on images that are provided as an input. Supported parameters for this test can be modified at runtime: @@ -380,5 +380,5 @@ def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_h continuous_reboot = ContinuousReboot( request, duthost, ptfhost, localhost, conn_graph_facts) continuous_reboot.start_continuous_reboot( - request, duthosts, duthost, ptfhost, localhost, tbinfo, creds) + request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds) continuous_reboot.test_teardown()