From 8f57edaab2e073d3723257af11b6031a64bf226f Mon Sep 17 00:00:00 2001 From: Jose Carlos Venegas Munoz Date: Thu, 13 Feb 2020 17:59:36 +0000 Subject: [PATCH] protocols: client: Add timeout for hybrid vsock handshake When the client tries to connect sometimes a race condition could happen when the between bind and listen calls in the agent vsock side. This will block the hypervisor wanting for a response and as consequence the agent client where it checks for an OK response. This case needs to be handled by the guest kernel, see https://lore.kernel.org/netdev/668b0eda8823564cd604b1663dc53fbaece0cd4e.camel@intel.com/ As an extra protection make the agent client timeout if no OK response is given. The response should be quick so is OK to wait a few seconds and then timeout. This also allow to return an error from the dialler function so retry does not fallback on grpc retry making retries faster. Fixes: #372 Signed-off-by: Jose Carlos Venegas Munoz --- protocols/client/client.go | 51 +++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/protocols/client/client.go b/protocols/client/client.go index fb04cbab2b..aac0dd1a2d 100644 --- a/protocols/client/client.go +++ b/protocols/client/client.go @@ -9,6 +9,7 @@ package client import ( "bufio" "context" + "errors" "fmt" "net" "net/url" @@ -400,6 +401,7 @@ func HybridVSockDialer(sock string, timeout time.Duration) (net.Conn, error) { } dialFunc := func() (net.Conn, error) { + handshakeTimeout := 10 * time.Second conn, err := net.DialTimeout("unix", udsPath, timeout) if err != nil { return nil, err @@ -418,26 +420,41 @@ func HybridVSockDialer(sock string, timeout time.Duration) (net.Conn, error) { return nil, err } - // A trivial handshake is included in the host-initiated vsock connection protocol. - // It looks like this: - // - [host] CONNECT - // - [guest/success] OK - reader := bufio.NewReader(conn) - response, err := reader.ReadString('\n') - if err != nil { - conn.Close() - agentClientLog.WithField("Error", err).Debug("HybridVsock trivial handshake failed") - // for now, we temporarily rely on the backoff strategy from GRPC for more stable CI. + errChan := make(chan error) + + go func() { + reader := bufio.NewReader(conn) + response, err := reader.ReadString('\n') + if err != nil { + errChan <- err + return + } + + agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake") + + if strings.Contains(response, "OK") { + errChan <- nil + } else { + errChan <- errors.New("HybridVsock trivial handshake failed with malformed response code") + } + }() + + select { + case err = <-errChan: + if err != nil { + conn.Close() + agentClientLog.WithField("Error", err).Debug("HybridVsock trivial handshake failed") + return nil, err + + } return conn, nil - } else if !strings.Contains(response, "OK") { + case <-time.After(handshakeTimeout): + // Timeout: kernel vsock implementation has a race condition, where no response is given + // Instead of waiting forever for a response, timeout after a fair amount of time. + // See: https://lore.kernel.org/netdev/668b0eda8823564cd604b1663dc53fbaece0cd4e.camel@intel.com/ conn.Close() - agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake failed with malformd response code") - // for now, we temporarily rely on the backoff strategy from GRPC for more stable CI. - return conn, nil + return nil, errors.New("timeout waiting for hybrid vsocket handshake") } - agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake") - - return conn, nil } timeoutErr := grpcStatus.Errorf(codes.DeadlineExceeded, "timed out connecting to hybrid vsocket %s", sock)