From 83ce0a464d520a5c2d8f53ef731cb552cdb23ec8 Mon Sep 17 00:00:00 2001 From: Chenyang Wang <49756587+cyw233@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:16:57 +1100 Subject: [PATCH] feat: add more sanity checks for T2 (#15253) Description of PR Add BFD up count check and MAC entries count check to sanity check for T2 topo. Summary: Fixes # (issue) Microsoft ADO 29825439 & 29825466 Approach What is the motivation for this PR? During our T2 Nightly run, we found that there is a chance that the port channel connection between 2 ASICs is up but the MAC address was not learned and the BFD session between them is down. Therefore, we need a sanity check to make sure all BFD sessions are up and all MAC addresses are learned; otherwise, issues like this will affect the test results and can impact the production environment. How did you do it? Added a check_bfd_up_count() function to the sanity check for the T2 topo only. This check takes ~4 seconds to run on a T2 device with 3 LCs (frontend nodes). Added a check_mac_entry_count() function to the sanity check for the T2 supervisor only. This check takes ~17 seconds to finish on a T2 device whose supervisor has 10 ASICs. How did you verify/test it? I ran the updated code on T2 with multiple test modules and can confirm it is checking the BFD up count and MAC entry count properly. Elastictest link: https://elastictest.org/scheduler/testplan/676bbfe8ab42af53500adb8d?leftSideViewMode=detail Besides, I can also confirm that these 2 checks are skipped on non-T2 devices. Any platform specific information? Supported testbed topology if it's a new test case? 
T2 co-authorized by: jianquanye@microsoft.com --- tests/common/plugins/sanity_check/__init__.py | 6 + tests/common/plugins/sanity_check/checks.py | 121 +++++++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py index 00c6d57d3cb..f9ce9c570af 100644 --- a/tests/common/plugins/sanity_check/__init__.py +++ b/tests/common/plugins/sanity_check/__init__.py @@ -109,6 +109,12 @@ def filter_check_items(tbinfo, check_items): if 'dualtor' not in tbinfo['topo']['name'] and 'check_mux_simulator' in filtered_check_items: filtered_check_items.remove('check_mux_simulator') + if 't2' not in tbinfo['topo']['name']: + if 'check_bfd_up_count' in filtered_check_items: + filtered_check_items.remove('check_bfd_up_count') + if 'check_mac_entry_count' in filtered_check_items: + filtered_check_items.remove('check_mac_entry_count') + return filtered_check_items diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py index d18c64bd020..ef54a079cc1 100644 --- a/tests/common/plugins/sanity_check/checks.py +++ b/tests/common/plugins/sanity_check/checks.py @@ -1,9 +1,12 @@ import re import json import logging +import threading + import pytest import time +from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor from tests.common.utilities import wait, wait_until from tests.common.dualtor.mux_simulator_control import get_mux_status, reset_simulator_port # noqa F401 from tests.common.dualtor.mux_simulator_control import restart_mux_simulator # noqa F401 @@ -22,6 +25,7 @@ MONIT_STABILIZE_MAX_TIME = 500 OMEM_THRESHOLD_BYTES = 10485760 # 10MB cache = FactsCache() +lock = threading.Lock() CHECK_ITEMS = [ 'check_processes', @@ -33,7 +37,9 @@ 'check_neighbor_macsec_empty', 'check_ipv6_mgmt', 'check_mux_simulator', - 'check_orchagent_usage'] + 'check_orchagent_usage', + 'check_bfd_up_count', + 'check_mac_entry_count'] __all__ 
@pytest.fixture(scope="module")
def check_bfd_up_count(duthosts):
    """Sanity-check fixture (T2 topologies only): verify that every ASIC on
    every frontend node reports the expected number of 'Up' BFD sessions.

    The expected count is computed once from the testbed inventory; the
    per-DUT work is fanned out with parallel_run and the per-ASIC work with
    a thread pool, since each probe is a blocking shell call to the DUT.

    Returns:
        A check function usable by the sanity-check plugin; it returns a list
        of per-host result dicts with keys "failed", "check_item", "host" and
        "bfd_up_count" (per-ASIC counts).
    """

    def _calc_expected_bfd_up_count():
        # NOTE(review): formula assumes each LC ASIC keeps two BFD sessions
        # (one per direction) with every other LC ASIC through every RP
        # ASIC -- confirm against the T2 VoQ topology design.
        total_lc_asics = 0
        total_rp_asics = 0
        for duthost in duthosts:
            if duthost.is_supervisor_node():
                total_rp_asics = len(duthost.asics)
            else:
                total_lc_asics += len(duthost.asics)

        return (total_lc_asics - 1) * total_rp_asics * 2

    expected_bfd_up_count = _calc_expected_bfd_up_count()

    def _check(*args, **kwargs):
        init_result = {"failed": False, "check_item": "bfd_up_count"}
        # Guard with <= 0, not == 0: with zero linecards discovered the
        # formula above yields a negative number, which an equality test
        # would silently accept as a comparison target.
        if expected_bfd_up_count <= 0:
            logger.error("Failed to calculate expected BFD up count")
            init_result["failed"] = True
            return [init_result]

        logger.info("Expected BFD up count is {}".format(expected_bfd_up_count))
        result = parallel_run(_check_bfd_up_count_on_dut, args, kwargs, duthosts.frontend_nodes,
                              timeout=600, init_result=init_result)

        return list(result.values())

    def _check_bfd_up_count_on_asic(asic, dut, check_result):
        # Count 'Up' sessions inside this ASIC's network namespace.
        asic_id = "asic{}".format(asic.asic_index)
        bfd_up_count_str = dut.shell("ip netns exec {} show bfd summary | grep -c 'Up'".format(asic_id))["stdout"]
        logger.info("BFD up count on {} of {} is {}".format(asic_id, dut.hostname, bfd_up_count_str))
        try:
            bfd_up_count = int(bfd_up_count_str)
        except Exception as e:
            logger.error("Failed to parse BFD up count on {} of {}: {}".format(asic_id, dut.hostname, e))
            bfd_up_count = -1  # sentinel: guaranteed to mismatch below

        # check_result is shared by all per-ASIC worker threads.
        with lock:
            check_result["bfd_up_count"][asic_id] = bfd_up_count
            if bfd_up_count != expected_bfd_up_count:
                check_result["failed"] = True
                logger.error("BFD up count on {} of {} is not as expected.".format(asic_id, dut.hostname))

    def _check_bfd_up_count_on_dut(*args, **kwargs):
        dut = kwargs['node']
        results = kwargs['results']
        check_result = {"failed": False, "check_item": "bfd_up_count", "host": dut.hostname, "bfd_up_count": {}}
        logger.info("Checking BFD up count on {}...".format(dut.hostname))
        # Probe all ASICs of this DUT concurrently; 8 workers covers current
        # platforms comfortably.
        with SafeThreadPoolExecutor(max_workers=8) as executor:
            for asic in dut.asics:
                executor.submit(_check_bfd_up_count_on_asic, asic, dut, check_result)

        logger.info("Done checking BFD up count on {}".format(dut.hostname))
        results[dut.hostname] = check_result

    return _check


@pytest.fixture(scope="module")
def check_mac_entry_count(duthosts):
    """Sanity-check fixture (T2 supervisor only): verify that every ASIC on
    every supervisor node has learned one MAC entry per frontend-node ASIC.

    Returns:
        A check function usable by the sanity-check plugin; it returns a list
        of per-host result dicts with keys "failed", "check_item", "host" and
        "mac_entry_count" (per-ASIC counts).
    """

    def _calc_expected_mac_entry_count():
        # One expected MAC entry per linecard ASIC.
        return sum(len(duthost.asics) for duthost in duthosts.frontend_nodes)

    expected_mac_entry_count = _calc_expected_mac_entry_count()

    def _check(*args, **kwargs):
        init_result = {"failed": False, "check_item": "mac_entry_count"}
        if expected_mac_entry_count == 0:
            # Consistency with check_bfd_up_count: an uncomputable
            # expectation is a failed check, not a silent pass (the
            # original logged the error but returned failed=False).
            logger.error("Failed to calculate expected MAC entry count")
            init_result["failed"] = True
            return [init_result]

        logger.info("Expected MAC entry count is: {}".format(expected_mac_entry_count))
        result = parallel_run(_check_mac_entry_count_on_dut, args, kwargs, duthosts.supervisor_nodes,
                              timeout=600, init_result=init_result)

        return list(result.values())

    def _check_mac_entry_count_on_asic(asic, dut, check_result):
        asic_id = "asic{}".format(asic.asic_index)
        show_mac_output = dut.shell("ip netns exec {} show mac".format(asic_id))["stdout"]
        try:
            # 'show mac' ends with a summary line: "Total number of entries N".
            match = re.search(r'Total number of entries (\d+)', show_mac_output)
            mac_entry_count = int(match.group(1)) if match else 0
        except Exception as e:
            logger.error("Failed to parse MAC entry count on {} of {}: {}".format(asic_id, dut.hostname, e))
            mac_entry_count = -1  # sentinel: guaranteed to mismatch below

        # check_result is shared by all per-ASIC worker threads.
        with lock:
            check_result["mac_entry_count"][asic_id] = mac_entry_count
            if mac_entry_count != expected_mac_entry_count:
                check_result["failed"] = True
                logger.error("MAC entry count on {} of {} is not as expected".format(asic_id, dut.hostname))

    def _check_mac_entry_count_on_dut(*args, **kwargs):
        dut = kwargs['node']
        results = kwargs['results']
        check_result = {"failed": False, "check_item": "mac_entry_count", "host": dut.hostname, "mac_entry_count": {}}
        logger.info("Checking MAC entry count on {}...".format(dut.hostname))
        with SafeThreadPoolExecutor(max_workers=8) as executor:
            for asic in dut.asics:
                executor.submit(_check_mac_entry_count_on_asic, asic, dut, check_result)

        logger.info("Done checking MAC entry count on {}".format(dut.hostname))
        results[dut.hostname] = check_result

    return _check