Skip to content

Commit

Permalink
Update the advanced reboot test to collect tcpdump on the server phys…
Browse files Browse the repository at this point in the history
…ical port

Get the tcpdump on the physical interface of the test server instead of the logical ptf interfaces. This is to fix the issue that sometimes there could be random packet drop on the ptf.
  • Loading branch information
congh-nvidia committed Nov 12, 2024
1 parent f7a4584 commit 13d2ec3
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 87 deletions.
27 changes: 27 additions & 0 deletions ansible/roles/test/files/ptftests/device_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,30 @@ def execCommand(self, cmd, timeout=DEFAULT_CMD_EXECUTION_TIMEOUT_SEC):
client.close()

return stdOut, stdErr, retValue

@retry(
stop_max_attempt_number=2,
retry_on_exception=lambda e: isinstance(e, AuthenticationException)
)
def fetch(self, remote_path, local_path):
"""
Fetch the file from the remote device
@param remote_path: the full path of the file to fetch
@param local_path: the full path of the file to be saved locally
"""
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
client.connect(self.hostname, username=self.username, password=self.password, allow_agent=False)
ftp_client = client.open_sftp()
ftp_client.get(remote_path, local_path)
ftp_client.close()
except AuthenticationException as authenticationException:
logger.error('SSH Authentication failure with message: %s' %
authenticationException)
if self.alt_password is not None:
# attempt retry with alt_password
self.password = self.alt_password
raise AuthenticationException
finally:
client.close()
105 changes: 27 additions & 78 deletions ansible/roles/test/files/ptftests/py3/advanced-reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ def __init__(self):
self.check_param('dut_username', '', required=True)
self.check_param('dut_password', '', required=True)
self.check_param('dut_hostname', '', required=True)
self.check_param('vmhost_username', '', required=True)
self.check_param('vmhost_password', '', required=True)
self.check_param('vmhost_mgmt_ip', '', required=True)
self.check_param('vmhost_external_port', '', required=True)
self.check_param('reboot_limit_in_seconds', 30, required=False)
self.check_param('reboot_type', 'fast-reboot', required=False)
self.check_param('graceful_limit', 240, required=False)
Expand Down Expand Up @@ -275,6 +279,12 @@ def __init__(self):
alt_password=self.test_params.get('alt_password')
)

self.vmhost_connection = DeviceConnection(
self.test_params['vmhost_mgmt_ip'],
self.test_params['vmhost_username'],
password=self.test_params['vmhost_password']
)

self.sender_thr = threading.Thread(target=self.send_in_background)
self.sniff_thr = threading.Thread(target=self.sniff_in_background)

Expand Down Expand Up @@ -754,6 +764,8 @@ def tearDown(self):

self.log("Disabling arp_responder")
self.cmd(["supervisorctl", "stop", "arp_responder"])
self.log("Remove the tcpdump pcap on the vm host.")
self.vmhost_connection.execCommand(f"sudo rm -rf {self.remote_capture_pcap}")

# Stop watching DUT
self.watching = False
Expand Down Expand Up @@ -1750,7 +1762,7 @@ def sniff_in_background(self, wait=None):
'wait': wait, 'sniff_filter': sniff_filter})
sniffer.start()
# Let the scapy sniff initialize completely.
time.sleep(2)
time.sleep(10)
# Unblock waiter for the send_in_background.
self.sniffer_started.set()
sniffer.join()
Expand All @@ -1760,18 +1772,21 @@ def sniff_in_background(self, wait=None):

def tcpdump_sniff(self, wait=300, sniff_filter=''):
"""
@summary: PTF runner - runs a sniffer in PTF container.
@summary: PTF runner - runs a sniffer in vmhost(server).
Args:
wait (int): Duration in seconds to sniff the traffic
sniff_filter (str): Filter that tcpdump will use to collect only relevant packets
"""
try:
capture_pcap = ("/tmp/capture_%s.pcap" % self.logfile_suffix
if self.logfile_suffix is not None else "/tmp/capture.pcap")
subprocess.call(["rm", "-rf", capture_pcap]) # remove old capture
remote_capture_pcap = capture_pcap + f"_{self.test_params['dut_hostname']}"
self.remote_capture_pcap = remote_capture_pcap
self.vmhost_connection.execCommand(f"sudo rm -rf {remote_capture_pcap}")
subprocess.call(["rm", "-rf", capture_pcap])
self.kill_sniffer = False
self.start_sniffer(capture_pcap, sniff_filter, wait)
self.create_single_pcap(capture_pcap)
self.start_sniffer(remote_capture_pcap, sniff_filter, wait)
self.vmhost_connection.fetch(remote_capture_pcap, capture_pcap)
self.packets = scapyall.rdpcap(capture_pcap)
self.log("Number of all packets captured: {}".format(len(self.packets)))
except Exception:
Expand All @@ -1782,14 +1797,10 @@ def start_sniffer(self, pcap_path, tcpdump_filter, timeout):
"""
Start tcpdump sniffer on all data interfaces, and kill them after a specified timeout
"""
self.tcpdump_data_ifaces = [
iface for iface in scapyall.get_if_list() if iface.startswith('eth')]
processes_list = []
for iface in self.tcpdump_data_ifaces:
iface_pcap_path = '{}_{}'.format(pcap_path, iface)
process = subprocess.Popen(['tcpdump', '-i', iface, tcpdump_filter, '-w', iface_pcap_path])
self.log('Tcpdump sniffer starting on iface: {}'.format(iface))
processes_list.append(process)
interface = self.test_params['vmhost_external_port']
cmd = f"sudo nohup tcpdump -i {interface} {tcpdump_filter} -w {pcap_path}"
self.vmhost_connection.execCommand(cmd)
self.log(f'Tcpdump sniffer starting on vmhost interface: {interface}')

time_start = time.time()
while not self.kill_sniffer:
Expand All @@ -1799,71 +1810,9 @@ def start_sniffer(self, pcap_path, tcpdump_filter, timeout):
break
time_start = curr_time

self.log("Going to kill all tcpdump processes by SIGTERM")
for process in processes_list:
process.terminate()

for process in processes_list:
process.wait(timeout=5)
# Return code here could be 0, so we need to explicitly check for None
if process.returncode is not None:
self.log("Tcpdump process {} terminated".format(process.args))

for process in processes_list:
if process.returncode is not None:
continue
self.log("Killing tcpdump process {}".format(process.args))
process.kill()
process.wait(timeout=5)
# Return code here could be 0, so we need to explicitly check for None
if process.returncode is not None:
self.log("Tcpdump process {} killed".format(process.args))

self.log("Killed all tcpdump processes")

def create_single_pcap(self, pcap_path):
"""
Merge all pcaps from each interface into single pcap file
"""
pcapng_full_capture = self.merge_pcaps(
pcap_path, self.tcpdump_data_ifaces)
self.convert_pcapng_to_pcap(pcap_path, pcapng_full_capture)
self.log('Pcap files merged into single pcap file: {}'.format(pcap_path))

def merge_pcaps(self, pcap_path, data_ifaces):
"""
Merge all pcaps into one, format: pcapng
"""
pcapng_full_capture = '{}.pcapng'.format(pcap_path)
cmd = ['mergecap', '-w', pcapng_full_capture]
ifaces_pcap_files_list = []
for iface in data_ifaces:
pcap_file_path = '{}_{}'.format(pcap_path, iface)
if os.path.exists(pcap_file_path):
cmd.append(pcap_file_path)
ifaces_pcap_files_list.append(pcap_file_path)

self.log('Starting merge pcap files')
subprocess.call(cmd)
self.log('Pcap files merged into tmp pcapng file')

# Remove pcap files created per interface
for pcap_file in ifaces_pcap_files_list:
subprocess.call(['rm', '-f', pcap_file])

return pcapng_full_capture

def convert_pcapng_to_pcap(self, pcap_path, pcapng_full_capture):
"""
Convert pcapng file into pcap. We can't just merge all in pcap,
mergecap can merge multiple files only into pcapng format
"""
cmd = ['mergecap', '-F', 'pcap', '-w', pcap_path, pcapng_full_capture]
self.log('Converting pcapng file into pcap file')
subprocess.call(cmd)
self.log('Pcapng file converted into pcap file')
# Remove tmp pcapng file
subprocess.call(['rm', '-f', pcapng_full_capture])
self.log("Going to kill the tcpdump process by SIGTERM")
self.vmhost_connection.execCommand(f'sudo pkill -f "{cmd}"')
self.log("Killed the tcpdump process")

def check_tcp_payload(self, packet):
"""
Expand Down
20 changes: 16 additions & 4 deletions tests/common/fixtures/advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class AdvancedReboot:
Test cases can trigger test start utilizing runRebootTestcase API.
"""

def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs):
def __init__(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs):
"""
Class constructor.
@param request: pytest request object
Expand Down Expand Up @@ -85,6 +85,7 @@ def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds
self.duthost = duthost
self.ptfhost = ptfhost
self.localhost = localhost
self.vmhost = vmhost
self.tbinfo = tbinfo
self.creds = creds
self.moduleIgnoreErrors = kwargs["allow_fail"] if "allow_fail" in kwargs else False
Expand Down Expand Up @@ -185,6 +186,13 @@ def __buildTestbedData(self, tbinfo):
if attr['hwsku'] == 'Arista-VM'
]

self.rebootData['vmhost_mgmt_ip'] = self.vmhost.mgmt_ip
self.rebootData['vmhost_external_port'] = self.vmhost.external_port
self.rebootData['vmhost_username'] = \
self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_user']
self.rebootData['vmhost_password'] = \
self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_password']

self.hostMaxLen = len(self.rebootData['arista_vms']) - 1
self.lagMemberCnt = len(list(self.mgFacts['minigraph_portchannels'].values())[0]['members'])
self.vlanMaxCnt = len(list(self.mgFacts['minigraph_vlans'].values())[0]['members']) - 1
Expand Down Expand Up @@ -707,6 +715,10 @@ def __runPtfRunner(self, rebootOper=None):
"dut_username": self.rebootData['dut_username'],
"dut_password": self.rebootData['dut_password'],
"dut_hostname": self.rebootData['dut_hostname'],
"vmhost_username": self.rebootData['vmhost_username'],
"vmhost_password": self.rebootData['vmhost_password'],
"vmhost_mgmt_ip": self.rebootData['vmhost_mgmt_ip'],
"vmhost_external_port": self.rebootData['vmhost_external_port'],
"reboot_limit_in_seconds": self.rebootLimit,
"reboot_type": self.rebootType,
"other_vendor_flag": self.other_vendor_nos,
Expand Down Expand Up @@ -875,8 +887,8 @@ def tearDown(self):


@pytest.fixture
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo,
creds):
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, vmhost,
tbinfo, creds):
"""
Pytest test fixture that provides access to AdvancedReboot test fixture
@param request: pytest request object
Expand All @@ -893,7 +905,7 @@ def get_advanced_reboot(**kwargs):
API that returns instances of AdvancedReboot class
"""
assert len(instances) == 0, "Only one instance of reboot data is allowed"
advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs)
advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs)
instances.append(advancedReboot)
return advancedReboot

Expand Down
10 changes: 5 additions & 5 deletions tests/platform_tests/test_cont_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def create_test_report(self):
pytest_assert(self.test_failures == 0, "Continuous reboot test failed {}/{} times".
format(self.test_failures, self.reboot_count))

def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds):
def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds):
self.test_set_up()
# Start continuous warm/fast reboot on the DUT
for count in range(self.continuous_reboot_count):
Expand All @@ -306,8 +306,8 @@ def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost
.format(self.reboot_count, self.continuous_reboot_count, self.reboot_type))
reboot_type = self.reboot_type + "-reboot"
try:
self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds,
rebootType=reboot_type, moduleIgnoreErrors=True)
self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo,
creds, rebootType=reboot_type, moduleIgnoreErrors=True)
except Exception:
self.sub_test_result = False
self.test_failures = self.test_failures + 1
Expand Down Expand Up @@ -355,7 +355,7 @@ def test_teardown(self):

@pytest.mark.device_type('vs')
def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname,
ptfhost, localhost, conn_graph_facts, tbinfo, creds):
ptfhost, localhost, vmhost, conn_graph_facts, tbinfo, creds):
"""
@summary: This test performs continuous reboot cycles on images that are provided as an input.
Supported parameters for this test can be modified at runtime:
Expand All @@ -380,5 +380,5 @@ def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_h
continuous_reboot = ContinuousReboot(
request, duthost, ptfhost, localhost, conn_graph_facts)
continuous_reboot.start_continuous_reboot(
request, duthosts, duthost, ptfhost, localhost, tbinfo, creds)
request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds)
continuous_reboot.test_teardown()

0 comments on commit 13d2ec3

Please sign in to comment.