Skip to content

Commit

Permalink
Speed up route_check script (sonic-net#3544)
Browse files Browse the repository at this point in the history
This PR fixes sonic-net/sonic-buildimage#18773

How I did it
Execute route_check on each ASIC in parallel.
Fetch IPv4 and IPv6 routes in parallel.

How to verify it
Execute "time route_check.py" on a T2 chassis with 32k IPv4 + 32k IPv6 routes.
Results:
Before:
Checking routes for namespaces: ['asic0', 'asic1']

real 3m16.387s
user 1m26.084s
sys 0m7.275s

After:
time route_check.py
real 1m30.675s
user 1m33.777s
sys 0m8.209s
  • Loading branch information
deepak-singhal0408 authored Nov 3, 2024
1 parent 329fc22 commit 7cbcfda
Showing 1 changed file with 110 additions and 80 deletions.
190 changes: 110 additions & 80 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import signal
import traceback
import subprocess
import concurrent.futures

from ipaddress import ip_network
from swsscommon import swsscommon
Expand Down Expand Up @@ -338,10 +339,18 @@ def is_suppress_fib_pending_enabled(namespace):
return state == 'enabled'


def get_frr_routes(namespace):
def fetch_routes(cmd):
"""
Read routes from zebra through CLI command
:return frr routes dictionary
Fetch routes using the given command.
"""
output = subprocess.check_output(cmd, text=True)
return json.loads(output)


def get_frr_routes_parallel(namespace):
"""
Read routes from zebra through CLI command for IPv4 and IPv6 in parallel
:return combined IPv4 and IPv6 routes dictionary.
"""
if namespace == multi_asic.DEFAULT_NAMESPACE:
v4_route_cmd = ['show', 'ip', 'route', 'json']
Expand All @@ -350,12 +359,18 @@ def get_frr_routes(namespace):
v4_route_cmd = ['show', 'ip', 'route', '-n', namespace, 'json']
v6_route_cmd = ['show', 'ipv6', 'route', '-n', namespace, 'json']

output = subprocess.check_output(v4_route_cmd, text=True)
routes = json.loads(output)
output = subprocess.check_output(v6_route_cmd, text=True)
routes.update(json.loads(output))
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, routes))
return routes
with concurrent.futures.ThreadPoolExecutor() as executor:
future_v4 = executor.submit(fetch_routes, v4_route_cmd)
future_v6 = executor.submit(fetch_routes, v6_route_cmd)

# Wait for both results to complete
v4_routes = future_v4.result()
v6_routes = future_v6.result()

# Combine both IPv4 and IPv6 routes
v4_routes.update(v6_routes)
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, v4_routes))
return v4_routes


def get_interfaces(namespace):
Expand Down Expand Up @@ -556,7 +571,7 @@ def check_frr_pending_routes(namespace):
retries = FRR_CHECK_RETRIES
for i in range(retries):
missed_rt = []
frr_routes = get_frr_routes(namespace)
frr_routes = get_frr_routes_parallel(namespace)

for _, entries in frr_routes.items():
for entry in entries:
Expand Down Expand Up @@ -689,8 +704,9 @@ def _filter_out_neigh_route(routes, neighs):
return rt_appl_miss, rt_asic_miss


def check_routes(namespace):
def check_routes_for_namespace(namespace):
"""
Process a Single Namespace:
The heart of this script which runs the checks.
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
Checkout routes in ASIC-DB to match APPL-DB, discounting local &
Expand All @@ -708,98 +724,113 @@ def check_routes(namespace):
:return (0, None) on sucess, else (-1, results) where results holds
the unjustifiable entries.
"""
namespace_list = []
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
namespace_list.append(namespace)
else:
namespace_list = multi_asic.get_namespace_list()
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)

results = {}
adds = {}
deletes = {}
for namespace in namespace_list:
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []
adds[namespace] = []
deletes[namespace] = []
adds = []
deletes = []
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []

selector, subs, rt_asic = get_asicdb_routes(namespace)
selector, subs, rt_asic = get_asicdb_routes(namespace)

rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)
rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)

# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)
# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)
# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)

# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)
if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)
if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)
# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)

# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)
if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds, deletes = get_subscribe_updates(selector, subs)

if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds[namespace], deletes[namespace] = get_subscribe_updates(selector, subs)
# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds)

# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds[namespace])
# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes)

# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes[namespace])
if rt_appl_miss:
results["missed_ROUTE_TABLE_routes"] = rt_appl_miss

if rt_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_ROUTE_TABLE_routes"] = rt_appl_miss
if intf_appl_miss:
results["missed_INTF_TABLE_entries"] = intf_appl_miss

if intf_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_INTF_TABLE_entries"] = intf_appl_miss
if rt_asic_miss:
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

if rt_asic_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss
rt_frr_miss = check_frr_pending_routes(namespace)

rt_frr_miss = check_frr_pending_routes(namespace)
if rt_frr_miss:
results["missed_FRR_routes"] = rt_frr_miss

if rt_frr_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_FRR_routes"] = rt_frr_miss
if results:
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} \
but all routes in APPL_DB and ASIC_DB are in sync".format(namespace))
if is_suppress_fib_pending_enabled(namespace):
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)

return results, adds, deletes

if results:
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} \
but all routes in APPL_DB and ASIC_DB are in sync".format(namespace))
if is_suppress_fib_pending_enabled(namespace):
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)

def check_routes(namespace):
"""
Main function to parallelize route checks across all namespaces.
"""
namespace_list = []
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
namespace_list.append(namespace)
else:
namespace_list = multi_asic.get_namespace_list()
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)

results = {}
all_adds = {}
all_deletes = {}

# Use ThreadPoolExecutor to parallelize the check for each namespace
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(check_routes_for_namespace, ns): ns for ns in namespace_list}

for future in concurrent.futures.as_completed(futures):
ns = futures[future]
try:
result, adds, deletes = future.result()
if result:
results[ns] = result
all_adds[ns] = adds
all_deletes[ns] = deletes
except Exception as e:
print_message(syslog.LOG_ERR, "Error processing namespace {}: {}".format(ns, e))

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4))
print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4))
print_message(syslog.LOG_WARNING, "add: ", json.dumps(all_adds, indent=4))
print_message(syslog.LOG_WARNING, "del: ", json.dumps(all_deletes, indent=4))
return -1, results
else:
print_message(syslog.LOG_INFO, "All good!")
Expand Down Expand Up @@ -862,6 +893,5 @@ def main():
return ret, res



if __name__ == "__main__":
sys.exit(main()[0])

0 comments on commit 7cbcfda

Please sign in to comment.