Skip to content

Commit

Permalink
Update the advanced reboot test to collect tcpdump on the server phys…
Browse files Browse the repository at this point in the history
…ical port

Get the tcpdump on the physical interface of the test server instead of the logical ptf interfaces. This is to fix the issue that sometimes there could be random packet drop on the ptf.
Keep the original approch to capture on the ptf interfaces for the kvm testbed which doesn't have the vmhost external port.
  • Loading branch information
congh-nvidia committed Nov 27, 2024
1 parent f7a4584 commit 961a7df
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 21 deletions.
27 changes: 27 additions & 0 deletions ansible/roles/test/files/ptftests/device_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,30 @@ def execCommand(self, cmd, timeout=DEFAULT_CMD_EXECUTION_TIMEOUT_SEC):
client.close()

return stdOut, stdErr, retValue

@retry(
stop_max_attempt_number=2,
retry_on_exception=lambda e: isinstance(e, AuthenticationException)
)
def fetch(self, remote_path, local_path):
"""
Fetch the file from the remote device
@param remote_path: the full path of the file to fetch
@param local_path: the full path of the file to be saved locally
"""
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
client.connect(self.hostname, username=self.username, password=self.password, allow_agent=False)
ftp_client = client.open_sftp()
ftp_client.get(remote_path, local_path)
ftp_client.close()
except AuthenticationException as authenticationException:
logger.error('SSH Authentication failure with message: %s' %
authenticationException)
if self.alt_password is not None:
# attempt retry with alt_password
self.password = self.alt_password
raise AuthenticationException
finally:
client.close()
86 changes: 75 additions & 11 deletions ansible/roles/test/files/ptftests/py3/advanced-reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ def __init__(self):
self.check_param('dut_username', '', required=True)
self.check_param('dut_password', '', required=True)
self.check_param('dut_hostname', '', required=True)
self.check_param('vmhost_username', '', required=False)
self.check_param('vmhost_password', '', required=False)
self.check_param('vmhost_mgmt_ip', '', required=False)
self.check_param('vmhost_external_port', '', required=False)
self.check_param('reboot_limit_in_seconds', 30, required=False)
self.check_param('reboot_type', 'fast-reboot', required=False)
self.check_param('graceful_limit', 240, required=False)
Expand Down Expand Up @@ -275,8 +279,17 @@ def __init__(self):
alt_password=self.test_params.get('alt_password')
)

self.vmhost_external_port = self.test_params['vmhost_external_port']
if self.vmhost_external_port:
self.vmhost_connection = DeviceConnection(
self.test_params['vmhost_mgmt_ip'],
self.test_params['vmhost_username'],
password=self.test_params['vmhost_password']
)

self.sender_thr = threading.Thread(target=self.send_in_background)
self.sniff_thr = threading.Thread(target=self.sniff_in_background)
self.start_sender_delay = 30

# Check if platform type is kvm
stdout, stderr, return_code = self.dut_connection.execCommand(
Expand Down Expand Up @@ -661,6 +674,15 @@ def setUp(self):
if self.kvm_test:
self.log("This test is for KVM platform")

self.capture_pcap = ("/tmp/capture_%s.pcap" % self.logfile_suffix
if self.logfile_suffix is not None else "/tmp/capture.pcap")
if self.vmhost_external_port:
self.log("Test will collect tcpdump on the vmhost external port")
remote_capture_pcap = self.capture_pcap + f"_{self.test_params['dut_hostname']}"
self.remote_capture_pcap = remote_capture_pcap
self.vmhost_connection.execCommand(f"sudo rm -rf {self.remote_capture_pcap}")
self.log(f"The pcap file on vmhost will be located in {remote_capture_pcap}")

# get VM info
if isinstance(self.test_params['arista_vms'], list):
arista_vms = self.test_params['arista_vms']
Expand Down Expand Up @@ -754,6 +776,9 @@ def tearDown(self):

self.log("Disabling arp_responder")
self.cmd(["supervisorctl", "stop", "arp_responder"])
if self.vmhost_external_port:
self.log("Remove the tcpdump pcap on the vm host.")
self.vmhost_connection.execCommand(f"sudo rm -rf {self.remote_capture_pcap}")

# Stop watching DUT
self.watching = False
Expand Down Expand Up @@ -1681,7 +1706,7 @@ def send_in_background(self, packets_list=None):
"""
if not packets_list:
packets_list = self.packets_list
self.sniffer_started.wait(timeout=10)
self.sniffer_started.wait(timeout=self.start_sender_delay)
with self.dataplane_io_lock:
# While running fast data plane sender thread there are two reasons for filter to be applied
# 1. filter out data plane traffic which is tcp to free up the load
Expand Down Expand Up @@ -1749,8 +1774,22 @@ def sniff_in_background(self, wait=None):
sniffer = threading.Thread(target=self.tcpdump_sniff, kwargs={
'wait': wait, 'sniff_filter': sniff_filter})
sniffer.start()
# Let the scapy sniff initialize completely.
time.sleep(2)
# Let the scapy sniff initialize completely. Need to wait more time when capturing on the vmhost.
base_tcpdump_delay = 2
time.sleep(base_tcpdump_delay)
if self.vmhost_external_port:
elapsed_time = 0
while elapsed_time < self.start_sender_delay - base_tcpdump_delay:
elapsed_time += 1
time.sleep(1)
stdout_lines, stderr_lines, _ = self.vmhost_connection.execCommand(f"ls {self.remote_capture_pcap}")
if (self.remote_capture_pcap + '\n') in stdout_lines and len(stderr_lines) == 0:
self.log(f"The pcap file on the vmhost is created: {self.remote_capture_pcap}")
break
else:
self.log(f"Error: the pcap file on the vmhost is not created in {self.start_sender_delay}s.")
raise Exception("Tcpdump on the vmhost failed to start, test is aborted.")

# Unblock waiter for the send_in_background.
self.sniffer_started.set()
sniffer.join()
Expand All @@ -1760,25 +1799,50 @@ def sniff_in_background(self, wait=None):

def tcpdump_sniff(self, wait=300, sniff_filter=''):
"""
@summary: PTF runner - runs a sniffer in PTF container.
@summary: PTF runner - runs a sniffer in vmhost(server) or the PTF container.
Args:
wait (int): Duration in seconds to sniff the traffic
sniff_filter (str): Filter that tcpdump will use to collect only relevant packets
"""
try:
capture_pcap = ("/tmp/capture_%s.pcap" % self.logfile_suffix
if self.logfile_suffix is not None else "/tmp/capture.pcap")
subprocess.call(["rm", "-rf", capture_pcap]) # remove old capture
subprocess.call(["rm", "-rf", self.capture_pcap])
self.kill_sniffer = False
self.start_sniffer(capture_pcap, sniff_filter, wait)
self.create_single_pcap(capture_pcap)
self.packets = scapyall.rdpcap(capture_pcap)

if self.vmhost_external_port:
self.start_sniffer_on_vmhost(self.remote_capture_pcap, sniff_filter, wait)
self.vmhost_connection.fetch(self.remote_capture_pcap, self.capture_pcap)
else:
self.start_sniffer_on_ptf(self.capture_pcap, sniff_filter, wait)
self.create_single_pcap(self.capture_pcap)

self.packets = scapyall.rdpcap(self.capture_pcap)
self.log("Number of all packets captured: {}".format(len(self.packets)))
except Exception:
traceback_msg = traceback.format_exc()
self.log("Error in tcpdump_sniff: {}".format(traceback_msg))

def start_sniffer(self, pcap_path, tcpdump_filter, timeout):
def start_sniffer_on_vmhost(self, pcap_path, tcpdump_filter, timeout):
"""
Start tcpdump sniffer on all data interfaces, and kill them after a specified timeout
"""
interface = self.test_params['vmhost_external_port']
cmd = f"sudo nohup tcpdump -i {interface} {tcpdump_filter} -w {pcap_path}"
self.vmhost_connection.execCommand(cmd + " > /dev/null 2>&1 &")
self.log(f'Tcpdump sniffer starting on vmhost interface: {interface}')

time_start = time.time()
while not self.kill_sniffer:
time.sleep(1)
curr_time = time.time()
if curr_time - time_start > timeout:
break
time_start = curr_time

self.log("Going to kill the tcpdump process by SIGTERM")
self.vmhost_connection.execCommand(f'sudo pkill -f "{cmd}"')
self.log("Killed the tcpdump process")

def start_sniffer_on_ptf(self, pcap_path, tcpdump_filter, timeout):
"""
Start tcpdump sniffer on all data interfaces, and kill them after a specified timeout
"""
Expand Down
2 changes: 1 addition & 1 deletion tests/common/devices/vmhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ def external_port(self):
vm = self.host.options["variable_manager"]
im = self.host.options["inventory_manager"]
hostvars = vm.get_vars(host=im.get_host(self.hostname), include_delegate_to=False)
setattr(self, "_external_port", hostvars["external_port"])
setattr(self, "_external_port", hostvars.get("external_port", ''))
return getattr(self, "_external_port")
26 changes: 22 additions & 4 deletions tests/common/fixtures/advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class AdvancedReboot:
Test cases can trigger test start utilizing runRebootTestcase API.
"""

def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs):
def __init__(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs):
"""
Class constructor.
@param request: pytest request object
Expand Down Expand Up @@ -85,6 +85,7 @@ def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds
self.duthost = duthost
self.ptfhost = ptfhost
self.localhost = localhost
self.vmhost = vmhost
self.tbinfo = tbinfo
self.creds = creds
self.moduleIgnoreErrors = kwargs["allow_fail"] if "allow_fail" in kwargs else False
Expand All @@ -99,6 +100,7 @@ def __init__(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds
self.lagMemberCnt = 0
self.vlanMaxCnt = 0
self.hostMaxCnt = HOST_MAX_COUNT
self.capture_on_vmhost = True if vmhost.external_port else False
if "dualtor" in self.getTestbedType():
self.dual_tor_mode = True
peer_duthost = get_peerhost(duthosts, duthost)
Expand Down Expand Up @@ -184,6 +186,13 @@ def __buildTestbedData(self, tbinfo):
attr['mgmt_addr'] for dev, attr in list(self.mgFacts['minigraph_devices'].items())
if attr['hwsku'] == 'Arista-VM'
]
if self.capture_on_vmhost:
self.rebootData['vmhost_mgmt_ip'] = self.vmhost.mgmt_ip
self.rebootData['vmhost_external_port'] = self.vmhost.external_port
self.rebootData['vmhost_username'] = \
self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_user']
self.rebootData['vmhost_password'] = \
self.duthost.host.options['variable_manager']._hostvars[self.vmhost.hostname]['vm_host_password']

self.hostMaxLen = len(self.rebootData['arista_vms']) - 1
self.lagMemberCnt = len(list(self.mgFacts['minigraph_portchannels'].values())[0]['members'])
Expand Down Expand Up @@ -736,6 +745,14 @@ def __runPtfRunner(self, rebootOper=None):
"neighbor_type": self.neighborType,
}

if self.capture_on_vmhost:
params.update({
"vmhost_username": self.rebootData['vmhost_username'],
"vmhost_password": self.rebootData['vmhost_password'],
"vmhost_mgmt_ip": self.rebootData['vmhost_mgmt_ip'],
"vmhost_external_port": self.rebootData['vmhost_external_port']
})

if self.dual_tor_mode:
params.update({
"peer_ports_file": self.rebootData['peer_ports_file'],
Expand Down Expand Up @@ -875,15 +892,16 @@ def tearDown(self):


@pytest.fixture
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo,
creds):
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, vmhost,
tbinfo, creds):
"""
Pytest test fixture that provides access to AdvancedReboot test fixture
@param request: pytest request object
@param duthosts: AnsibleHost instance of DUT
@param ptfhost: PTFHost for interacting with PTF through ansible
@param localhost: Localhost for interacting with localhost through ansible
@param tbinfo: fixture provides information about testbed
@param vmhost: AnsibleHost instance of the test server
"""
duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
instances = []
Expand All @@ -893,7 +911,7 @@ def get_advanced_reboot(**kwargs):
API that returns instances of AdvancedReboot class
"""
assert len(instances) == 0, "Only one instance of reboot data is allowed"
advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds, **kwargs)
advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds, **kwargs)
instances.append(advancedReboot)
return advancedReboot

Expand Down
10 changes: 5 additions & 5 deletions tests/platform_tests/test_cont_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def create_test_report(self):
pytest_assert(self.test_failures == 0, "Continuous reboot test failed {}/{} times".
format(self.test_failures, self.reboot_count))

def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, tbinfo, creds):
def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds):
self.test_set_up()
# Start continuous warm/fast reboot on the DUT
for count in range(self.continuous_reboot_count):
Expand All @@ -306,8 +306,8 @@ def start_continuous_reboot(self, request, duthosts, duthost, ptfhost, localhost
.format(self.reboot_count, self.continuous_reboot_count, self.reboot_type))
reboot_type = self.reboot_type + "-reboot"
try:
self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, tbinfo, creds,
rebootType=reboot_type, moduleIgnoreErrors=True)
self.advancedReboot = AdvancedReboot(request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo,
creds, rebootType=reboot_type, moduleIgnoreErrors=True)
except Exception:
self.sub_test_result = False
self.test_failures = self.test_failures + 1
Expand Down Expand Up @@ -355,7 +355,7 @@ def test_teardown(self):

@pytest.mark.device_type('vs')
def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname,
ptfhost, localhost, conn_graph_facts, tbinfo, creds):
ptfhost, localhost, vmhost, conn_graph_facts, tbinfo, creds):
"""
@summary: This test performs continuous reboot cycles on images that are provided as an input.
Supported parameters for this test can be modified at runtime:
Expand All @@ -380,5 +380,5 @@ def test_continuous_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_h
continuous_reboot = ContinuousReboot(
request, duthost, ptfhost, localhost, conn_graph_facts)
continuous_reboot.start_continuous_reboot(
request, duthosts, duthost, ptfhost, localhost, tbinfo, creds)
request, duthosts, duthost, ptfhost, localhost, vmhost, tbinfo, creds)
continuous_reboot.test_teardown()

0 comments on commit 961a7df

Please sign in to comment.