From 3bc3f385c05fef5bdece521216f445e93136ba3d Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:11:49 +0800 Subject: [PATCH] [dualtor][mux_simulator] Fix mux simulator stuck (#15226) What is the motivation for this PR? Active-standby Dualtor is failing to talk to mux_simulator: # curl -v http://10.64.246.154:8082/mux/vms24-7/24 * Trying 10.64.246.154:8082... On the test server, TCP SYN drops are reported to be increasing: # netstat -s | grep -i listen 1531500 times the listen queue of a socket overflowed 1531501 SYNs to LISTEN sockets dropped The mux simulator's listen queue is overflowing: # ss -lnt State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 129 128 0.0.0.0:8082 0.0.0.0:* It appears that mux_simulator is stuck in recvfrom: # strace -p 21315 strace: Process 21315 attached recvfrom(6, and there is no existing TCP connection on the test server/DUT for fd 6. mux_simulator is blocked reading from an already-closed TCP connection, so subsequent HTTP requests cannot be handled properly, which results in the TCP listen queue overflow. How did you do it? Enable mux_simulator to work in threaded mode. Set the default socket timeout to 60s; if a worker thread gets stuck in recvfrom like this, the timeout ensures the worker thread exits after 60s, so no resources are leaked. How did you verify/test it? Run mux_simulator with the change. 
Signed-off-by: Longxiang Lyu --- ansible/roles/vm_set/files/mux_simulator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/roles/vm_set/files/mux_simulator.py b/ansible/roles/vm_set/files/mux_simulator.py index 81c74700785..db49c115974 100644 --- a/ansible/roles/vm_set/files/mux_simulator.py +++ b/ansible/roles/vm_set/files/mux_simulator.py @@ -8,6 +8,7 @@ import re import shlex import subprocess +import socket import sys import threading import traceback @@ -966,4 +967,5 @@ def log_message(vm_set): app.logger.info('Starting server on port {}'.format(sys.argv[1])) create_muxes(arg_vm_set) app.logger.info('####################### STARTING HTTP SERVER #######################') - app.run(host='0.0.0.0', port=http_port, threaded=False) + socket.setdefaulttimeout(60) + app.run(host='0.0.0.0', port=http_port, threaded=True) # nosemgrep